; $Id: cos.asm $
;; @file
; IPRT - No-CRT cos - AMD64 & X86.
;

;
; Copyright (C) 2006-2023 Oracle and/or its affiliates.
;
; This file is part of VirtualBox base platform packages, as
; available from https://www.virtualbox.org.
;
; This program is free software; you can redistribute it and/or
; modify it under the terms of the GNU General Public License
; as published by the Free Software Foundation, in version 3 of the
; License.
;
; This program is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
; General Public License for more details.
;
; You should have received a copy of the GNU General Public License
; along with this program; if not, see <https://www.gnu.org/licenses>.
;
; The contents of this file may alternatively be used under the terms
; of the Common Development and Distribution License Version 1.0
; (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
; in the VirtualBox distribution, in which case the provisions of the
; CDDL are applicable instead of those of the GPL.
;
; You may elect to license modified versions of this file under the
; terms and conditions of either the GPL or the CDDL or both.
;
; SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
;


%define RT_ASM_WITH_SEH64
%include "iprt/asmdefs.mac"
%include "iprt/x86.mac"


BEGINCODE

;;
; Compute the cosine of rd, measured in radians.
;
; @returns  st(0) / xmm0
; @param    rd      [rbp + xCB*2] / xmm0
;
RT_NOCRT_BEGINPROC cos
        push    xBP
        SEH64_PUSH_xBP
        mov     xBP, xSP
        SEH64_SET_FRAME_xBP 0
        sub     xSP, 20h
        SEH64_ALLOCATE_STACK 20h
        SEH64_END_PROLOGUE

%ifdef RT_OS_WINDOWS
        ;
        ; Make sure we use full precision and not the windows default of 53 bits.
        ;
;; @todo not sure if this makes any difference...
        fnstcw  [xBP - 20h]
        mov     ax, [xBP - 20h]
        or      ax, X86_FCW_PC_64       ; includes both bits, so no need to clear the mask.
        mov     [xBP - 1ch], ax
        fldcw   [xBP - 1ch]
%endif

        ;
        ; Load the input into st0.
        ;
%ifdef RT_ARCH_AMD64
        movsd   [xBP - 10h], xmm0
        fld     qword [xBP - 10h]
%else
        fld     qword [xBP + xCB*2]
%endif

        ;
        ; The FCOS instruction has a very narrow range (-3pi/8 to 3pi/8) where it
        ; works reliably, so outside that we'll use the FSIN instruction instead
        ; as it has a larger good range (-5pi/4 to 1pi/4 for cosine).
        ; Input conversion follows: cos(x) = sin(x + pi/2)
        ;
        ; We examin the input and weed out non-finit numbers first.
        ;

        ; We only do the range check on normal finite numbers.
        fxam
        fnstsw  ax
        and     ax, X86_FSW_C3 | X86_FSW_C2 | X86_FSW_C0
        cmp     ax, X86_FSW_C2              ; Normal finite number (excluding zero)
        je      .finite
        cmp     ax, X86_FSW_C3              ; Zero
        je      .zero
        cmp     ax, X86_FSW_C3 | X86_FSW_C2 ; Denormals - treat them as zero.
        je      .zero
        cmp     ax, X86_FSW_C0              ; NaN - must handle it special,
        je      .nan

        ; Pass infinities and unsupported inputs to fcos, assuming it does the right thing.
        ; We also jump here if we get a finite number in the "good" range, see below.
.do_fcos:
        fcos
        jmp     .return_val

        ;
        ; Finite number.
        ;
        ; First check if it's a very tiny number where we can simply return 1.
        ; Next check if it's in the range where FCOS is reasonable, otherwise
        ; go to FSIN to do the work.
        ;
.finite:
        fld     st0
        fabs
        fld     qword [.s_r64TinyCosTo1 xWrtRIP]
        fcomip  st1
        ja      .zero_extra_pop

.not_that_tiny_input:
        fld     qword [.s_r64FCosOkay xWrtRIP]
        fcomip  st1
        ffreep  st0                         ; pop fabs(input)
        ja      .do_fcos                    ; jmp if fabs(input) < .s_r64FCosOkay

        ;
        ; If we have a positive number we subtract 3pi/2, for negative we add pi/2.
        ; We still have the FXAM result in AX.
        ;
.outside_fcos_range:
        test    ax, X86_FSW_C1              ; The sign bit.
        jnz     .adjust_negative_to_sine

        ; Calc -3pi/2 using FPU-internal pi constant.
        fldpi
        fadd    st0, st0                    ; st0=2pi
        fldpi
        fdiv    qword [.s_r64Two xWrtRIP]   ; st1=2pi; st0=pi/2
        fsubp   st1, st0                    ; st0=3pi/2
        fchs                                ; st0=-3pi/2
        jmp     .make_sine_adjustment

.adjust_negative_to_sine:
        ; Calc +pi/2.
        fldpi
        fdiv    qword [.s_r64Two xWrtRIP]   ; st1=2pi; st0=pi/2

.make_sine_adjustment:
        faddp   st1, st0

        ;
        ; Call internal sine worker to calculate st0=sin(st0)
        ;
.do_sine:
        mov     ecx, 1                      ; double
        extern  NAME(rtNoCrtMathSinCore)
        call    NAME(rtNoCrtMathSinCore)

        ;
        ; Return st0.
        ;
.return_val:
%ifdef RT_ARCH_AMD64
        fstp    qword [xBP - 10h]
        movsd   xmm0, [xBP - 10h]
%endif
%ifdef RT_OS_WINDOWS
        fldcw   [xBP - 20h]                 ; restore original
%endif
.return:
        leave
        ret

        ;
        ; cos(+/-0) = +1.0
        ;
.zero_extra_pop:
        ffreep  st0
.zero:
        ffreep  st0
        fld1
        jmp     .return_val

        ;
        ; Input is NaN, output it unmodified as far as we can (FLD changes SNaN
        ; to QNaN when masked).
        ;
.nan:
%ifdef RT_ARCH_AMD64
        ffreep  st0
%endif
        jmp     .return

        ;
        ; Local constants.
        ;
ALIGNCODE(8)
        ; About 2**-27. When fabs(input) is below this limit we can consider cos(input) ~= 1.0.
.s_r64TinyCosTo1:
        dq  7.4505806e-9

        ; The absolute limit for the range which FCOS is expected to produce reasonable results.
.s_r64FCosOkay:
        dq  1.1780972450961724644225   ; 3*pi/8

.s_r64Two:
        dq  2.0
ENDPROC   RT_NOCRT(cos)