; $Id: sin.asm $
;; @file
; IPRT - No-CRT sin - AMD64 & X86.
;

;
; Copyright (C) 2006-2023 Oracle and/or its affiliates.
;
; This file is part of VirtualBox base platform packages, as
; available from https://www.virtualbox.org.
;
; This program is free software; you can redistribute it and/or
; modify it under the terms of the GNU General Public License
; as published by the Free Software Foundation, in version 3 of the
; License.
;
; This program is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
; General Public License for more details.
;
; You should have received a copy of the GNU General Public License
; along with this program; if not, see <https://www.gnu.org/licenses>.
;
; The contents of this file may alternatively be used under the terms
; of the Common Development and Distribution License Version 1.0
; (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
; in the VirtualBox distribution, in which case the provisions of the
; CDDL are applicable instead of those of the GPL.
;
; You may elect to license modified versions of this file under the
; terms and conditions of either the GPL or the CDDL or both.
;
; SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
;


%define RT_ASM_WITH_SEH64
%include "iprt/asmdefs.mac"
%include "iprt/x86.mac"


BEGINCODE


;;
; Compute the sine of rf, measured in radians.
;
; @returns  st(0) / xmm0
; @param    rf      [rbp + xCB*2] / xmm0
;
RT_NOCRT_BEGINPROC sin
        push    xBP
        SEH64_PUSH_xBP
        mov     xBP, xSP
        SEH64_SET_FRAME_xBP 0
        sub     xSP, 20h
        SEH64_ALLOCATE_STACK 20h
        SEH64_END_PROLOGUE

%ifdef RT_OS_WINDOWS
        ;
        ; Make sure we use full precision and not the windows default of 53 bits.
        ;
        fnstcw  [xBP - 20h]
        mov     ax, [xBP - 20h]
        or      ax, X86_FCW_PC_64       ; includes both bits, so no need to clear the mask.
        mov     [xBP - 1ch], ax
        fldcw   [xBP - 1ch]
%endif

        ;
        ; Load the input into st0.
        ;
%ifdef RT_ARCH_AMD64
        movsd   [xBP - 10h], xmm0
        fld     qword [xBP - 10h]
%else
        fld     qword [xBP + xCB*2]
%endif

        ;
        ; We examin the input and weed out non-finit numbers first.
        ;
        fxam
        fnstsw  ax
        and     ax, X86_FSW_C3 | X86_FSW_C2 | X86_FSW_C0
        cmp     ax, X86_FSW_C2              ; Normal finite number (excluding zero)
        je      .finite
        cmp     ax, X86_FSW_C3              ; Zero
        je      .zero
        cmp     ax, X86_FSW_C3 | X86_FSW_C2 ; Denormals - treat them as zero.
        je      .zero
        cmp     ax, X86_FSW_C0              ; NaN - must handle it special,
        je      .nan

        ; Pass infinities and unsupported inputs to fsin, assuming it does the right thing.
.do_sin:
        fsin
        jmp     .return_val

        ;
        ; Finite number.
        ;
.finite:
        ; For very tiny numbers, 0 < abs(input) < 2**-25, we can return the
        ; input value directly.
        fld     st0                         ; duplicate st0
        fabs                                ; make it an absolute (positive) value.
        fld     qword [.s_r64Tiny xWrtRIP]
        fcomip  st1                         ; compare s_r64Tiny and fabs(input)
        ja      .return_tiny_number_as_is   ; jump if fabs(input) is smaller

        ; FSIN is documented to be reasonable for the range ]-3pi/4,3pi/4[, so
        ; while we have fabs(input) loaded already, check for that here and
        ; allow rtNoCrtMathSinCore to assume it won't see values very close to
        ; zero, except by cos -> sin conversion where they won't be relevant to
        ; any assumpttions about precision approximation.
        fld     qword [.s_r64FSinOkay xWrtRIP]
        fcomip  st1
        ffreep  st0                         ; drop the fabs(input) value
        ja      .do_sin

        ;
        ; Call common sine/cos worker.
        ;
        mov     ecx, 1                      ; double
        extern  NAME(rtNoCrtMathSinCore)
        call    NAME(rtNoCrtMathSinCore)

        ;
        ; Run st0.
        ;
.return_val:
%ifdef RT_ARCH_AMD64
        fstp    qword [xBP - 10h]
        movsd   xmm0, [xBP - 10h]
%endif
%ifdef RT_OS_WINDOWS
        fldcw   [xBP - 20h]                 ; restore original
%endif
.return:
        leave
        ret

        ;
        ; As explained already, we can return tiny numbers directly too as the
        ; output from sin(input) = input given our precision.
        ; We can skip the st0 -> xmm0 translation here, so follow the same path
        ; as .zero & .nan, after we've removed the fabs(input) value.
        ;
.return_tiny_number_as_is:
        ffreep  st0

        ;
        ; sin(+/-0.0) = +/-0.0 (preserve the sign)
        ; We can skip the st0 -> xmm0 translation here, so follow the .nan code path.
        ;
.zero:

        ;
        ; Input is NaN, output it unmodified as far as we can (FLD changes SNaN
        ; to QNaN when masked).
        ;
.nan:
%ifdef RT_ARCH_AMD64
        ffreep  st0
%endif
        jmp     .return

ALIGNCODE(8)
        ; Ca. 2**-17, absolute value. Inputs closer to zero than this can be
        ; returns directly as the sin(input) value should be basically the same
        ; given the precision we're working with and FSIN probably won't even
        ; manage that.
        ;; @todo experiment when FSIN gets better than this.
.s_r64Tiny:
        dq      0.00000762939453125
        ; The absolute limit of FSIN "good" range.
.s_r64FSinOkay:
        dq      2.356194490192344928845 ; 3pi/4
        ;dq      1.57079632679489661923  ; pi/2 - alternative.

ENDPROC   RT_NOCRT(sin)