// TODO: when `unsafe_block_in_unsafe_fn` is stabilized, remove this
#![allow(unused_unsafe)]
// The functions are complex with many branches, and explicit
// `return`s makes it clear where function exit points are
#![allow(clippy::needless_return)]
#![allow(clippy::comparison_chain)]
// Clippy is confused by the complex configuration
#![allow(clippy::if_same_then_else)]
#![allow(clippy::needless_bool)]

//! This `specialized_div_rem` module is originally from version 1.0.0 of the
//! `specialized-div-rem` crate. Note that `for` loops with ranges are not used in this
//! module, since unoptimized compilation may generate references to `memcpy`.
//!
//! The purpose of these macros is to easily change the both the division algorithm used
//! for a given integer size and the half division used by that algorithm. The way
//! functions call each other is also constructed such that linkers will find the chain of
//! software and hardware divisions needed for every size of signed and unsigned division.
//! For example, most target compilations do the following:
//!
//!  - Many 128 bit division functions like `u128::wrapping_div` use
//!    `std::intrinsics::unchecked_div`, which gets replaced by `__udivti3` because there
//!    is not a 128 bit by 128 bit hardware division function in most architectures.
//!    `__udivti3` uses `u128_div_rem` (this extra level of function calls exists because
//!    `__umodti3` and `__udivmodti4` also exist, and `specialized_div_rem` supplies just
//!    one function to calculate both the quotient and remainder. If configuration flags
//!    enable it, `impl_trifecta!` defines `u128_div_rem` to use the trifecta algorithm,
//!    which requires the half sized division `u64_by_u64_div_rem`. If the architecture
//!    supplies a 64 bit hardware division instruction, `u64_by_u64_div_rem` will be
//!    reduced to those instructions. Note that we do not specify the half size division
//!    directly to be `__udivdi3`, because hardware division would never be introduced.
//!  - If the architecture does not supply a 64 bit hardware division instruction, u64
//!    divisions will use functions such as `__udivdi3`. This will call `u64_div_rem`
//!    which is defined by `impl_delegate!`. The half division for this algorithm is
//!    `u32_by_u32_div_rem` which in turn becomes hardware division instructions or more
//!    software division algorithms.
//!  - If the architecture does not supply a 32 bit hardware instruction, linkers will
//!    look for `__udivsi3`. `impl_binary_long!` is used, but this  algorithm uses no half
//!    division, so the chain of calls ends here.
//!
//! On some architectures like x86_64, an asymmetrically sized division is supplied, in
//! which 128 bit numbers can be divided by 64 bit numbers. `impl_asymmetric!` is used to
//! extend the 128 by 64 bit division to a full 128 by 128 bit division.

// `allow(dead_code)` is used in various places, because the configuration code would otherwise be
// ridiculously complex

#[macro_use]
mod norm_shift;

#[macro_use]
mod binary_long;

#[macro_use]
mod delegate;

// used on SPARC
#[allow(unused_imports)]
#[cfg(not(feature = "public-test-deps"))]
pub(crate) use self::delegate::u128_divide_sparc;

#[cfg(feature = "public-test-deps")]
pub use self::delegate::u128_divide_sparc;

#[macro_use]
mod trifecta;

#[macro_use]
mod asymmetric;

/// The behavior of all divisions by zero is controlled by this function. This function should be
/// impossible to reach by Rust users, unless `compiler-builtins` public division functions or
/// `core/std::unchecked_div/rem` are directly used without a zero check in front.
fn zero_div_fn() -> ! {
    unsafe { core::hint::unreachable_unchecked() }
}

const USE_LZ: bool = {
    if cfg!(target_arch = "arm") {
        if cfg!(target_feature = "thumb-mode") {
            // ARM thumb targets have CLZ instructions if the instruction set of ARMv6T2 is
            // supported. This is needed to successfully differentiate between targets like
            // `thumbv8.base` and `thumbv8.main`.
            cfg!(target_feature = "v6t2")
        } else {
            // Regular ARM targets have CLZ instructions if the ARMv5TE instruction set is
            // supported. Technically, ARMv5T was the first to have CLZ, but the "v5t" target
            // feature does not seem to work.
            cfg!(target_feature = "v5te")
        }
    } else if cfg!(any(target_arch = "sparc", target_arch = "sparc64")) {
        // LZD or LZCNT on SPARC only exists for the VIS 3 extension and later.
        cfg!(target_feature = "vis3")
    } else if cfg!(any(target_arch = "riscv32", target_arch = "riscv64")) {
        // The `B` extension on RISC-V determines if a CLZ assembly instruction exists
        cfg!(target_feature = "b")
    } else {
        // All other common targets Rust supports should have CLZ instructions
        true
    }
};

impl_normalization_shift!(
    u32_normalization_shift,
    USE_LZ,
    32,
    u32,
    i32,
    allow(dead_code)
);
impl_normalization_shift!(
    u64_normalization_shift,
    USE_LZ,
    64,
    u64,
    i64,
    allow(dead_code)
);

/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
/// `checked_div` and `checked_rem` are used to avoid bringing in panic function
/// dependencies.
#[inline]
fn u64_by_u64_div_rem(duo: u64, div: u64) -> (u64, u64) {
    if let Some(quo) = duo.checked_div(div) {
        if let Some(rem) = duo.checked_rem(div) {
            return (quo, rem);
        }
    }
    zero_div_fn()
}

// Whether `trifecta` or `delegate` is faster for 128 bit division depends on the speed at which a
// microarchitecture can multiply and divide. We decide to be optimistic and assume `trifecta` is
// faster if the target pointer width is at least 64.
#[cfg(all(
    not(any(target_pointer_width = "16", target_pointer_width = "32")),
    not(all(not(feature = "no-asm"), target_arch = "x86_64")),
    not(any(target_arch = "sparc", target_arch = "sparc64"))
))]
impl_trifecta!(
    u128_div_rem,
    zero_div_fn,
    u64_by_u64_div_rem,
    32,
    u32,
    u64,
    u128
);

// If the pointer width less than 64, then the target architecture almost certainly does not have
// the fast 64 to 128 bit widening multiplication needed for `trifecta` to be faster.
#[cfg(all(
    any(target_pointer_width = "16", target_pointer_width = "32"),
    not(all(not(feature = "no-asm"), target_arch = "x86_64")),
    not(any(target_arch = "sparc", target_arch = "sparc64"))
))]
impl_delegate!(
    u128_div_rem,
    zero_div_fn,
    u64_normalization_shift,
    u64_by_u64_div_rem,
    32,
    u32,
    u64,
    u128,
    i128
);

/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
///
/// # Safety
///
/// If the quotient does not fit in a `u64`, a floating point exception occurs.
/// If `div == 0`, then a division by zero exception occurs.
#[cfg(all(not(feature = "no-asm"), target_arch = "x86_64"))]
#[inline]
unsafe fn u128_by_u64_div_rem(duo: u128, div: u64) -> (u64, u64) {
    let duo_lo = duo as u64;
    let duo_hi = (duo >> 64) as u64;
    let quo: u64;
    let rem: u64;
    unsafe {
        // divides the combined registers rdx:rax (`duo` is split into two 64 bit parts to do this)
        // by `div`. The quotient is stored in rax and the remainder in rdx.
        // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
        core::arch::asm!(
            "div {0}",
            in(reg) div,
            inlateout("rax") duo_lo => quo,
            inlateout("rdx") duo_hi => rem,
            options(att_syntax, pure, nomem, nostack)
        );
    }
    (quo, rem)
}

// use `asymmetric` instead of `trifecta` on x86_64
#[cfg(all(not(feature = "no-asm"), target_arch = "x86_64"))]
impl_asymmetric!(
    u128_div_rem,
    zero_div_fn,
    u64_by_u64_div_rem,
    u128_by_u64_div_rem,
    32,
    u32,
    u64,
    u128
);

/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
/// `checked_div` and `checked_rem` are used to avoid bringing in panic function
/// dependencies.
#[inline]
#[allow(dead_code)]
fn u32_by_u32_div_rem(duo: u32, div: u32) -> (u32, u32) {
    if let Some(quo) = duo.checked_div(div) {
        if let Some(rem) = duo.checked_rem(div) {
            return (quo, rem);
        }
    }
    zero_div_fn()
}

// When not on x86 and the pointer width is not 64, use `delegate` since the division size is larger
// than register size.
#[cfg(all(
    not(all(not(feature = "no-asm"), target_arch = "x86")),
    not(target_pointer_width = "64")
))]
impl_delegate!(
    u64_div_rem,
    zero_div_fn,
    u32_normalization_shift,
    u32_by_u32_div_rem,
    16,
    u16,
    u32,
    u64,
    i64
);

// When not on x86 and the pointer width is 64, use `binary_long`.
#[cfg(all(
    not(all(not(feature = "no-asm"), target_arch = "x86")),
    target_pointer_width = "64"
))]
impl_binary_long!(
    u64_div_rem,
    zero_div_fn,
    u64_normalization_shift,
    64,
    u64,
    i64
);

/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
///
/// # Safety
///
/// If the quotient does not fit in a `u32`, a floating point exception occurs.
/// If `div == 0`, then a division by zero exception occurs.
#[cfg(all(not(feature = "no-asm"), target_arch = "x86"))]
#[inline]
unsafe fn u64_by_u32_div_rem(duo: u64, div: u32) -> (u32, u32) {
    let duo_lo = duo as u32;
    let duo_hi = (duo >> 32) as u32;
    let quo: u32;
    let rem: u32;
    unsafe {
        // divides the combined registers rdx:rax (`duo` is split into two 32 bit parts to do this)
        // by `div`. The quotient is stored in rax and the remainder in rdx.
        // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
        core::arch::asm!(
            "div {0}",
            in(reg) div,
            inlateout("rax") duo_lo => quo,
            inlateout("rdx") duo_hi => rem,
            options(att_syntax, pure, nomem, nostack)
        );
    }
    (quo, rem)
}

// use `asymmetric` instead of `delegate` on x86
#[cfg(all(not(feature = "no-asm"), target_arch = "x86"))]
impl_asymmetric!(
    u64_div_rem,
    zero_div_fn,
    u32_by_u32_div_rem,
    u64_by_u32_div_rem,
    16,
    u16,
    u32,
    u64
);

// 32 bits is the smallest division used by `compiler-builtins`, so we end with binary long division
impl_binary_long!(
    u32_div_rem,
    zero_div_fn,
    u32_normalization_shift,
    32,
    u32,
    i32
);