/* $Id: timer-r0drv-linux.c $ */
/** @file
 * IPRT - Timers, Ring-0 Driver, Linux.
 */

/*
 * Copyright (C) 2006-2023 Oracle and/or its affiliates.
 *
 * This file is part of VirtualBox base platform packages, as
 * available from https://www.virtualbox.org.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, in version 3 of the
 * License.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, see <https://www.gnu.org/licenses>.
 *
 * The contents of this file may alternatively be used under the terms
 * of the Common Development and Distribution License Version 1.0
 * (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
 * in the VirtualBox distribution, in which case the provisions of the
 * CDDL are applicable instead of those of the GPL.
 *
 * You may elect to license modified versions of this file under the
 * terms and conditions of either the GPL or the CDDL or both.
 *
 * SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
 */


/*********************************************************************************************************************************
*   Header Files                                                                                                                 *
*********************************************************************************************************************************/
#include "the-linux-kernel.h"
#include "internal/iprt.h"

#include <iprt/timer.h>
#include <iprt/time.h>
#include <iprt/mp.h>
#include <iprt/cpuset.h>
#include <iprt/spinlock.h>
#include <iprt/err.h>
#include <iprt/asm.h>
#include <iprt/assert.h>
#include <iprt/alloc.h>
#include <iprt/string.h>

#include "internal/magics.h"

/** @def RTTIMER_LINUX_WITH_HRTIMER
 * Whether to use high resolution timers. */
#if !defined(RTTIMER_LINUX_WITH_HRTIMER) \
    && defined(IPRT_LINUX_HAS_HRTIMER)
# define RTTIMER_LINUX_WITH_HRTIMER
#endif

#if RTLNX_VER_MAX(2,6,31)
# define mod_timer_pinned           mod_timer
# define HRTIMER_MODE_ABS_PINNED    HRTIMER_MODE_ABS
#endif


/*********************************************************************************************************************************
*   Structures and Typedefs                                                                                                      *
*********************************************************************************************************************************/
/**
 * Timer state machine.
 *
 * This is used to try to handle the issues with MP events and
 * timers that run on all CPUs. It's relatively nasty :-/
 */
typedef enum RTTIMERLNXSTATE
{
    /** Stopped. */
    RTTIMERLNXSTATE_STOPPED = 0,
    /** Transient state; next ACTIVE. */
    RTTIMERLNXSTATE_STARTING,
    /** Transient state; next ACTIVE. (not really necessary) */
    RTTIMERLNXSTATE_MP_STARTING,
    /** Active. */
    RTTIMERLNXSTATE_ACTIVE,
    /** Active and in callback; next ACTIVE, STOPPED or CALLBACK_DESTROYING. */
    RTTIMERLNXSTATE_CALLBACK,
    /** Stopped while in the callback; next STOPPED. */
    RTTIMERLNXSTATE_CB_STOPPING,
    /** Restarted while in the callback; next ACTIVE, STOPPED, DESTROYING. */
    RTTIMERLNXSTATE_CB_RESTARTING,
    /** The callback shall destroy the timer; next STOPPED. */
    RTTIMERLNXSTATE_CB_DESTROYING,
    /** Transient state; next STOPPED. */
    RTTIMERLNXSTATE_STOPPING,
    /** Transient state; next STOPPED. */
    RTTIMERLNXSTATE_MP_STOPPING,
    /** The usual 32-bit hack. */
    RTTIMERLNXSTATE_32BIT_HACK = 0x7fffffff
} RTTIMERLNXSTATE;
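/*
 * A rough sketch of the common transitions (illustrative summary of the state
 * descriptions above; the MP-event and destroy paths add more edges):
 *
 *      RTTimerStart():     STOPPED  -> STARTING -> ACTIVE
 *      callback entry:     ACTIVE   -> CALLBACK
 *      callback exit:      CALLBACK -> ACTIVE           (interval timer)
 *                          CALLBACK -> STOPPED          (one-shot timer)
 *      RTTimerStop():      ACTIVE   -> STOPPING -> STOPPED
 *                          CALLBACK -> CB_STOPPING -> STOPPED
 *      restart from cb:    CALLBACK -> CB_RESTARTING -> ACTIVE
 *      destroy from cb:    CALLBACK -> CB_DESTROYING -> STOPPED
 */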
/**
 * A Linux sub-timer.
 */
typedef struct RTTIMERLNXSUBTIMER
{
    /** Timer specific data. */
    union
    {
#if defined(RTTIMER_LINUX_WITH_HRTIMER)
        /** High resolution timer. */
        struct
        {
            /** The linux timer structure. */
            struct hrtimer          LnxTimer;
        } Hr;
#endif
        /** Standard timer. */
        struct
        {
            /** The linux timer structure. */
            struct timer_list       LnxTimer;
            /** The start of the current run (ns).
             * This is used to calculate when the timer ought to fire the next time. */
            uint64_t                u64NextTS;
            /** When the timer was started. */
            uint64_t                nsStartTS;
            /** The u64NextTS in jiffies. */
            unsigned long           ulNextJiffies;
            /** Set when starting or changing the timer so that nsStartTS
             *  and u64NextTS get reinitialized (eliminating some jitter). */
            bool volatile           fFirstAfterChg;
        } Std;
    } u;
    /** The current tick number. */
    uint64_t                iTick;
    /** Restart the single shot timer at this specific time.
     * Used when a single shot timer is restarted from the callback. */
    uint64_t volatile       uNsRestartAt;
    /** Pointer to the parent timer. */
    PRTTIMER                pParent;
    /** The current sub-timer state. */
    RTTIMERLNXSTATE volatile enmState;
} RTTIMERLNXSUBTIMER;
/** Pointer to a linux sub-timer. */
typedef RTTIMERLNXSUBTIMER *PRTTIMERLNXSUBTIMER;


/**
 * The internal representation of a Linux timer handle.
 */
typedef struct RTTIMER
{
    /** Magic.
     * This is RTTIMER_MAGIC, but changes to something else before the timer
     * is destroyed to indicate clearly that the handle is no longer valid. */
    uint32_t volatile       u32Magic;
    /** Spinlock synchronizing the fSuspended and MP event handling.
     * This is NIL_RTSPINLOCK if cCpus == 1. */
    RTSPINLOCK              hSpinlock;
    /** Flag indicating that the timer is suspended. */
    bool volatile           fSuspended;
    /** Whether the timer must run on one specific CPU or not. */
    bool                    fSpecificCpu;
#ifdef CONFIG_SMP
    /** Whether the timer must run on all CPUs or not. */
    bool                    fAllCpus;
#endif /* else: All -> specific on non-SMP kernels */
    /** Whether it is a high resolution timer or a standard one. */
    bool                    fHighRes;
    /** The id of the CPU it must run on if fSpecificCpu is set. */
    RTCPUID                 idCpu;
    /** The number of CPUs this timer should run on. */
    RTCPUID                 cCpus;
    /** Callback. */
    PFNRTTIMER              pfnTimer;
    /** User argument. */
    void                   *pvUser;
    /** The timer interval. 0 if one-shot. */
    uint64_t volatile       u64NanoInterval;
    /** This is set to the number of jiffies between ticks if the interval is
     * an exact number of jiffies. (Standard timers only.) */
    unsigned long volatile  cJiffies;
    /** The change interval spinlock for standard timers only. */
    spinlock_t              ChgIntLock;
    /** Workqueue item for delayed destruction. */
    RTR0LNXWORKQUEUEITEM    DtorWorkqueueItem;
    /** Sub-timers.
     * Normally there is just one, but for RTTIMER_FLAGS_CPU_ALL this will contain
     * an entry for all possible cpus. In that case the index will be the same as
     * for the RTCpuSet. */
    RTTIMERLNXSUBTIMER      aSubTimers[1];
} RTTIMER;


/**
 * An argument package for the worker functions that start a timer on a
 * specific CPU (rtTimerLnxStartOnSpecificCpu and friends).
 */
typedef struct RTTIMERLINUXSTARTONCPUARGS
{
    /** The current time (RTTimeSystemNanoTS). */
    uint64_t                u64Now;
    /** When to start firing (delta). */
    uint64_t                u64First;
} RTTIMERLINUXSTARTONCPUARGS;
/** Pointer to a rtTimerLinuxStartOnCpu argument package. */
typedef RTTIMERLINUXSTARTONCPUARGS *PRTTIMERLINUXSTARTONCPUARGS;
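/*
 * Note: aSubTimers[1] above is a variable-size trailer.  A minimal sketch of
 * how the structure is sized for an omni timer (this mirrors what
 * RTTimerCreateEx at the bottom of this file actually does):
 *
 *      RTCPUID  cCpus  = RTMpGetMaxCpuId() + 1;
 *      PRTTIMER pTimer = NULL;
 *      int rc = RTMemAllocEx(RT_UOFFSETOF_DYN(RTTIMER, aSubTimers[cCpus]), 0,
 *                            RTMEMALLOCEX_FLAGS_ZEROED | RTMEMALLOCEX_FLAGS_ANY_CTX_FREE,
 *                            (void **)&pTimer);
 *
 * RT_UOFFSETOF_DYN(RTTIMER, aSubTimers[cCpus]) is the byte offset of the
 * first sub-timer past the ones needed, i.e. the total allocation size.
 */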
/*********************************************************************************************************************************
*   Internal Functions                                                                                                           *
*********************************************************************************************************************************/
#ifdef CONFIG_SMP
static DECLCALLBACK(void) rtTimerLinuxMpEvent(RTMPEVENT enmEvent, RTCPUID idCpu, void *pvUser);
#endif

#if 0
# define DEBUG_HACKING
# include <iprt/string.h>
# include <iprt/asm-amd64-x86.h>
static void myLogBackdoorPrintf(const char *pszFormat, ...)
{
    char        szTmp[256];
    va_list     args;
    size_t      cb;

    cb = RTStrPrintf(szTmp, sizeof(szTmp) - 10, "%d: ", RTMpCpuId());
    va_start(args, pszFormat);
    cb += RTStrPrintfV(&szTmp[cb], sizeof(szTmp) - cb, pszFormat, args);
    va_end(args);
    ASMOutStrU8(0x504, (uint8_t *)&szTmp[0], cb);
}
# define RTAssertMsg1Weak(pszExpr, uLine, pszFile, pszFunction) \
    myLogBackdoorPrintf("\n!!Guest Assertion failed!!\n%s(%d) %s\n%s\n", pszFile, uLine, pszFunction, (pszExpr))
# define RTAssertMsg2Weak myLogBackdoorPrintf
# define RTTIMERLNX_LOG(a)          myLogBackdoorPrintf a
#else
# define RTTIMERLNX_LOG(a)          do { } while (0)
#endif


/**
 * Sets the state.
 */
DECLINLINE(void) rtTimerLnxSetState(RTTIMERLNXSTATE volatile *penmState, RTTIMERLNXSTATE enmNewState)
{
#ifdef DEBUG_HACKING
    RTTIMERLNX_LOG(("set %d -> %d\n", *penmState, enmNewState));
#endif
    ASMAtomicWriteU32((uint32_t volatile *)penmState, enmNewState);
}


/**
 * Sets the state if it has a certain value.
 *
 * @return true if xchg was done.
 * @return false if xchg wasn't done.
 */
#ifdef DEBUG_HACKING
# define rtTimerLnxCmpXchgState(penmState, enmNewState, enmCurState) \
    rtTimerLnxCmpXchgStateDebug(penmState, enmNewState, enmCurState, __LINE__)
static bool rtTimerLnxCmpXchgStateDebug(RTTIMERLNXSTATE volatile *penmState, RTTIMERLNXSTATE enmNewState,
                                        RTTIMERLNXSTATE enmCurState, uint32_t uLine)
{
    RTTIMERLNXSTATE enmOldState = enmCurState;
    bool fRc = ASMAtomicCmpXchgExU32((uint32_t volatile *)penmState, enmNewState, enmCurState, (uint32_t *)&enmOldState);
    RTTIMERLNX_LOG(("cxg %d -> %d - %d at %u\n", enmOldState, enmNewState, fRc, uLine));
    return fRc;
}
#else
DECLINLINE(bool) rtTimerLnxCmpXchgState(RTTIMERLNXSTATE volatile *penmState, RTTIMERLNXSTATE enmNewState,
                                        RTTIMERLNXSTATE enmCurState)
{
    return ASMAtomicCmpXchgU32((uint32_t volatile *)penmState, enmNewState, enmCurState);
}
#endif


/**
 * Gets the state.
 */
DECLINLINE(RTTIMERLNXSTATE) rtTimerLnxGetState(RTTIMERLNXSTATE volatile *penmState)
{
    return (RTTIMERLNXSTATE)ASMAtomicUoReadU32((uint32_t volatile *)penmState);
}

#ifdef RTTIMER_LINUX_WITH_HRTIMER

/**
 * Converts a nano second time stamp to ktime_t.
 *
 * ASSUMES RTTimeSystemNanoTS() is implemented using ktime_get_ts().
 *
 * @returns ktime_t.
 * @param   cNanoSecs   Nanoseconds.
 */
DECLINLINE(ktime_t) rtTimerLnxNanoToKt(uint64_t cNanoSecs)
{
    /* With some luck the compiler optimizes the division out of this... (Bet it doesn't.) */
    return ktime_set(cNanoSecs / 1000000000, cNanoSecs % 1000000000);
}

/**
 * Converts ktime_t to a nano second time stamp.
 *
 * ASSUMES RTTimeSystemNanoTS() is implemented using ktime_get_ts().
 *
 * @returns nano second time stamp.
 * @param   Kt          ktime_t.
 */
DECLINLINE(uint64_t) rtTimerLnxKtToNano(ktime_t Kt)
{
    return ktime_to_ns(Kt);
}

#endif /* RTTIMER_LINUX_WITH_HRTIMER */
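/*
 * For instance (illustrative only): rtTimerLnxNanoToKt(1500000000) splits the
 * value into whole seconds and the remainder, i.e. ktime_set(1, 500000000),
 * and rtTimerLnxKtToNano() takes that back to 1500000000 via ktime_to_ns().
 */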
/**
 * Converts a nano second interval to jiffies.
 *
 * @returns Jiffies.
 * @param   cNanoSecs   Nanoseconds.
 */
DECLINLINE(unsigned long) rtTimerLnxNanoToJiffies(uint64_t cNanoSecs)
{
    /* this can be made even better... */
    if (cNanoSecs > (uint64_t)TICK_NSEC * MAX_JIFFY_OFFSET)
        return MAX_JIFFY_OFFSET;
# if ARCH_BITS == 32
    if (RT_LIKELY(cNanoSecs <= UINT32_MAX))
        return ((uint32_t)cNanoSecs + (TICK_NSEC-1)) / TICK_NSEC;
# endif
    return (cNanoSecs + (TICK_NSEC-1)) / TICK_NSEC;
}


/**
 * Starts a sub-timer (RTTimerStart).
 *
 * @param   pSubTimer   The sub-timer to start.
 * @param   u64Now      The current timestamp (RTTimeSystemNanoTS()).
 * @param   u64First    The interval from u64Now to the first time the timer should fire.
 * @param   fPinned     true = timer pinned to a specific CPU,
 *                      false = timer can migrate between CPUs.
 * @param   fHighRes    Whether the user requested a high resolution timer or not.
 */
static void rtTimerLnxStartSubTimer(PRTTIMERLNXSUBTIMER pSubTimer, uint64_t u64Now, uint64_t u64First,
                                    bool fPinned, bool fHighRes)
{
    /*
     * Calc when it should start firing.
     */
    uint64_t u64NextTS = u64Now + u64First;
    if (!fHighRes)
    {
        pSubTimer->u.Std.u64NextTS = u64NextTS;
        pSubTimer->u.Std.nsStartTS = u64NextTS;
    }
    RTTIMERLNX_LOG(("startsubtimer %p\n", pSubTimer->pParent));

    pSubTimer->iTick = 0;

#ifdef RTTIMER_LINUX_WITH_HRTIMER
    if (fHighRes)
        hrtimer_start(&pSubTimer->u.Hr.LnxTimer, rtTimerLnxNanoToKt(u64NextTS),
                      fPinned ? HRTIMER_MODE_ABS_PINNED : HRTIMER_MODE_ABS);
    else
#endif
    {
        unsigned long cJiffies = !u64First ? 0 : rtTimerLnxNanoToJiffies(u64First);
        pSubTimer->u.Std.ulNextJiffies  = jiffies + cJiffies;
        pSubTimer->u.Std.fFirstAfterChg = true;
#ifdef CONFIG_SMP
        if (fPinned)
        {
# if RTLNX_VER_MIN(4,8,0)
            mod_timer(&pSubTimer->u.Std.LnxTimer, pSubTimer->u.Std.ulNextJiffies);
# else
            mod_timer_pinned(&pSubTimer->u.Std.LnxTimer, pSubTimer->u.Std.ulNextJiffies);
# endif
        }
        else
#endif
            mod_timer(&pSubTimer->u.Std.LnxTimer, pSubTimer->u.Std.ulNextJiffies);
    }

    /* Be a bit careful here since we could be racing the callback. */
    if (!rtTimerLnxCmpXchgState(&pSubTimer->enmState, RTTIMERLNXSTATE_ACTIVE, RTTIMERLNXSTATE_STARTING))
        rtTimerLnxCmpXchgState(&pSubTimer->enmState, RTTIMERLNXSTATE_ACTIVE, RTTIMERLNXSTATE_MP_STARTING);
}


/**
 * Stops a sub-timer (RTTimerStart and rtTimerLinuxMpEvent()).
 *
 * The caller has already changed the state, so we will not be in a callback
 * situation wrt to the calling thread.
 *
 * @param   pSubTimer   The sub-timer.
 * @param   fHighRes    Whether the user requested a high resolution timer or not.
 */
static void rtTimerLnxStopSubTimer(PRTTIMERLNXSUBTIMER pSubTimer, bool fHighRes)
{
    RTTIMERLNX_LOG(("stopsubtimer %p %d\n", pSubTimer->pParent, fHighRes));
#ifdef RTTIMER_LINUX_WITH_HRTIMER
    if (fHighRes)
    {
        /* There is no equivalent to del_timer in the hrtimer API,
           hrtimer_cancel() == del_timer_sync().  Just like the WARN_ON in
           del_timer_sync() asserts, waiting for a timer callback to complete
           is deadlock prone, so don't do it. */
        int rc = hrtimer_try_to_cancel(&pSubTimer->u.Hr.LnxTimer);
        if (rc < 0)
        {
            hrtimer_start(&pSubTimer->u.Hr.LnxTimer, ktime_set(KTIME_SEC_MAX, 0), HRTIMER_MODE_ABS);
            hrtimer_try_to_cancel(&pSubTimer->u.Hr.LnxTimer);
        }
    }
    else
#endif
        del_timer(&pSubTimer->u.Std.LnxTimer);

    rtTimerLnxSetState(&pSubTimer->enmState, RTTIMERLNXSTATE_STOPPED);
}
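/*
 * A worked example for rtTimerLnxNanoToJiffies() above (numbers assume
 * HZ=250, i.e. TICK_NSEC=4000000; both are kernel-config dependent): a 10ms
 * interval gives (10000000 + 3999999) / 4000000 = 3 jiffies, i.e. the
 * conversion rounds up so a timer never fires early at this granularity.
 */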
/**
 * Used by RTTimerDestroy and rtTimerLnxCallbackDestroy to do the actual work.
 *
 * @param   pTimer  The timer in question.
 */
static void rtTimerLnxDestroyIt(PRTTIMER pTimer)
{
    RTSPINLOCK hSpinlock = pTimer->hSpinlock;
    RTCPUID    iCpu;
    Assert(pTimer->fSuspended);
    RTTIMERLNX_LOG(("destroyit %p\n", pTimer));

    /*
     * Remove the MP notifications first because it'll reduce the risk of
     * us overtaking any MP event that might theoretically be racing us here.
     */
#ifdef CONFIG_SMP
    if (    pTimer->cCpus > 1
        &&  hSpinlock != NIL_RTSPINLOCK)
    {
        int rc = RTMpNotificationDeregister(rtTimerLinuxMpEvent, pTimer);
        AssertRC(rc);
    }
#endif /* CONFIG_SMP */

    /*
     * Invalidate the handle.
     */
    ASMAtomicWriteU32(&pTimer->u32Magic, ~RTTIMER_MAGIC);

    /*
     * Make sure all timers have stopped executing since we're stopping them in
     * an asynchronous manner up in rtTimerLnxStopSubTimer.
     */
    iCpu = pTimer->cCpus;
    while (iCpu-- > 0)
    {
#ifdef RTTIMER_LINUX_WITH_HRTIMER
        if (pTimer->fHighRes)
            hrtimer_cancel(&pTimer->aSubTimers[iCpu].u.Hr.LnxTimer);
        else
#endif
            del_timer_sync(&pTimer->aSubTimers[iCpu].u.Std.LnxTimer);
    }

    /*
     * Finally, free the resources.
     */
    RTMemFreeEx(pTimer, RT_UOFFSETOF_DYN(RTTIMER, aSubTimers[pTimer->cCpus]));
    if (hSpinlock != NIL_RTSPINLOCK)
        RTSpinlockDestroy(hSpinlock);
}


/**
 * Workqueue callback (no DECLCALLBACK!) for deferred destruction.
 *
 * @param   pWork   Pointer to the DtorWorkqueueItem member of our timer
 *                  structure.
 */
static void rtTimerLnxDestroyDeferred(RTR0LNXWORKQUEUEITEM *pWork)
{
    PRTTIMER pTimer = RT_FROM_MEMBER(pWork, RTTIMER, DtorWorkqueueItem);
    rtTimerLnxDestroyIt(pTimer);
}


/**
 * Called when the timer was destroyed by the callback function.
 *
 * @param   pTimer      The timer.
 * @param   pSubTimer   The sub-timer which we're handling, the state of this
 *                      will be RTTIMERLNXSTATE_CB_DESTROYING.
 */
static void rtTimerLnxCallbackDestroy(PRTTIMER pTimer, PRTTIMERLNXSUBTIMER pSubTimer)
{
    /*
     * If it's an omni timer, the last dude does the destroying.
     */
    if (pTimer->cCpus > 1)
    {
        uint32_t iCpu = pTimer->cCpus;
        RTSpinlockAcquire(pTimer->hSpinlock);

        Assert(pSubTimer->enmState == RTTIMERLNXSTATE_CB_DESTROYING);
        rtTimerLnxSetState(&pSubTimer->enmState, RTTIMERLNXSTATE_STOPPED);

        while (iCpu-- > 0)
            if (rtTimerLnxGetState(&pTimer->aSubTimers[iCpu].enmState) != RTTIMERLNXSTATE_STOPPED)
            {
                RTSpinlockRelease(pTimer->hSpinlock);
                return;
            }

        RTSpinlockRelease(pTimer->hSpinlock);
    }

    /*
     * Destroying a timer from the callback is unsafe since the callout code
     * might be touching the timer structure upon return (hrtimer does!). So,
     * we have to defer the actual destruction to the IPRT workqueue.
     */
    rtR0LnxWorkqueuePush(&pTimer->DtorWorkqueueItem, rtTimerLnxDestroyDeferred);
}
#ifdef CONFIG_SMP
/**
 * Deal with a sub-timer that has migrated.
 *
 * @param   pTimer      The timer.
 * @param   pSubTimer   The sub-timer.
 */
static void rtTimerLnxCallbackHandleMigration(PRTTIMER pTimer, PRTTIMERLNXSUBTIMER pSubTimer)
{
    RTTIMERLNXSTATE enmState;
    if (pTimer->cCpus > 1)
        RTSpinlockAcquire(pTimer->hSpinlock);

    do
    {
        enmState = rtTimerLnxGetState(&pSubTimer->enmState);
        switch (enmState)
        {
            case RTTIMERLNXSTATE_STOPPING:
            case RTTIMERLNXSTATE_MP_STOPPING:
                enmState = RTTIMERLNXSTATE_STOPPED;
                RT_FALL_THRU();
            case RTTIMERLNXSTATE_STOPPED:
                break;

            default:
                AssertMsgFailed(("%d\n", enmState));
                RT_FALL_THRU();
            case RTTIMERLNXSTATE_STARTING:
            case RTTIMERLNXSTATE_MP_STARTING:
            case RTTIMERLNXSTATE_ACTIVE:
            case RTTIMERLNXSTATE_CALLBACK:
            case RTTIMERLNXSTATE_CB_STOPPING:
            case RTTIMERLNXSTATE_CB_RESTARTING:
                if (rtTimerLnxCmpXchgState(&pSubTimer->enmState, RTTIMERLNXSTATE_STOPPED, enmState))
                    enmState = RTTIMERLNXSTATE_STOPPED;
                break;

            case RTTIMERLNXSTATE_CB_DESTROYING:
            {
                if (pTimer->cCpus > 1)
                    RTSpinlockRelease(pTimer->hSpinlock);
                rtTimerLnxCallbackDestroy(pTimer, pSubTimer);
                return;
            }
        }
    } while (enmState != RTTIMERLNXSTATE_STOPPED);

    if (pTimer->cCpus > 1)
        RTSpinlockRelease(pTimer->hSpinlock);
}
#endif /* CONFIG_SMP */


/**
 * The slow path of rtTimerLnxChangeToCallbackState.
 *
 * @returns true if changed successfully, false if not.
 * @param   pSubTimer   The sub-timer.
 */
static bool rtTimerLnxChangeToCallbackStateSlow(PRTTIMERLNXSUBTIMER pSubTimer)
{
    for (;;)
    {
        RTTIMERLNXSTATE enmState = rtTimerLnxGetState(&pSubTimer->enmState);
        switch (enmState)
        {
            case RTTIMERLNXSTATE_ACTIVE:
            case RTTIMERLNXSTATE_STARTING:
            case RTTIMERLNXSTATE_MP_STARTING:
                if (rtTimerLnxCmpXchgState(&pSubTimer->enmState, RTTIMERLNXSTATE_CALLBACK, enmState))
                    return true;
                break;

            case RTTIMERLNXSTATE_CALLBACK:
            case RTTIMERLNXSTATE_CB_STOPPING:
            case RTTIMERLNXSTATE_CB_RESTARTING:
            case RTTIMERLNXSTATE_CB_DESTROYING:
                AssertMsgFailed(("%d\n", enmState));
                RT_FALL_THRU();
            default:
                return false;
        }
        ASMNopPause();
    }
}


/**
 * Tries to change the sub-timer state to 'callback'.
 *
 * @returns true if changed successfully, false if not.
 * @param   pSubTimer   The sub-timer.
 */
DECLINLINE(bool) rtTimerLnxChangeToCallbackState(PRTTIMERLNXSUBTIMER pSubTimer)
{
    if (RT_LIKELY(rtTimerLnxCmpXchgState(&pSubTimer->enmState, RTTIMERLNXSTATE_CALLBACK, RTTIMERLNXSTATE_ACTIVE)))
        return true;
    return rtTimerLnxChangeToCallbackStateSlow(pSubTimer);
}
#ifdef RTTIMER_LINUX_WITH_HRTIMER

/**
 * Timer callback function for high resolution timers.
 *
 * @returns HRTIMER_NORESTART or HRTIMER_RESTART depending on whether it's a
 *          one-shot or interval timer.
 * @param   pHrTimer    Pointer to the sub-timer structure.
 */
static enum hrtimer_restart rtTimerLinuxHrCallback(struct hrtimer *pHrTimer)
{
    PRTTIMERLNXSUBTIMER pSubTimer = RT_FROM_MEMBER(pHrTimer, RTTIMERLNXSUBTIMER, u.Hr.LnxTimer);
    PRTTIMER            pTimer    = pSubTimer->pParent;

    RTTIMERLNX_LOG(("hrcallback %p\n", pTimer));
    if (RT_UNLIKELY(!rtTimerLnxChangeToCallbackState(pSubTimer)))
        return HRTIMER_NORESTART;

#ifdef CONFIG_SMP
    /*
     * Check for unwanted migration.
     */
    if (pTimer->fAllCpus || pTimer->fSpecificCpu)
    {
        RTCPUID idCpu = RTMpCpuId();
        if (RT_UNLIKELY(  pTimer->fAllCpus
                        ? (RTCPUID)(pSubTimer - &pTimer->aSubTimers[0]) != idCpu
                        : pTimer->idCpu != idCpu))
        {
            rtTimerLnxCallbackHandleMigration(pTimer, pSubTimer);
            return HRTIMER_NORESTART;
        }
    }
#endif

    if (pTimer->u64NanoInterval)
    {
        /*
         * Periodic timer, run it and update the native timer afterwards so
         * we can handle RTTimerStop and RTTimerChangeInterval from the
         * callback as well as a racing control thread.
         */
        pTimer->pfnTimer(pTimer, pTimer->pvUser, ++pSubTimer->iTick);
        hrtimer_add_expires_ns(&pSubTimer->u.Hr.LnxTimer, ASMAtomicReadU64(&pTimer->u64NanoInterval));
        if (RT_LIKELY(rtTimerLnxCmpXchgState(&pSubTimer->enmState, RTTIMERLNXSTATE_ACTIVE, RTTIMERLNXSTATE_CALLBACK)))
            return HRTIMER_RESTART;
    }
    else
    {
        /*
         * One shot timer (no omni), stop it before dispatching it.
         * Allow RTTimerStart as well as RTTimerDestroy to be called from
         * the callback.
         */
        ASMAtomicWriteBool(&pTimer->fSuspended, true);
        pTimer->pfnTimer(pTimer, pTimer->pvUser, ++pSubTimer->iTick);
        if (RT_LIKELY(rtTimerLnxCmpXchgState(&pSubTimer->enmState, RTTIMERLNXSTATE_STOPPED, RTTIMERLNXSTATE_CALLBACK)))
            return HRTIMER_NORESTART;
    }

    /*
     * Some state change occurred while we were in the callback routine.
     */
    for (;;)
    {
        RTTIMERLNXSTATE enmState = rtTimerLnxGetState(&pSubTimer->enmState);
        switch (enmState)
        {
            case RTTIMERLNXSTATE_CB_DESTROYING:
                rtTimerLnxCallbackDestroy(pTimer, pSubTimer);
                return HRTIMER_NORESTART;

            case RTTIMERLNXSTATE_CB_STOPPING:
                if (rtTimerLnxCmpXchgState(&pSubTimer->enmState, RTTIMERLNXSTATE_STOPPED, RTTIMERLNXSTATE_CB_STOPPING))
                    return HRTIMER_NORESTART;
                break;

            case RTTIMERLNXSTATE_CB_RESTARTING:
                if (rtTimerLnxCmpXchgState(&pSubTimer->enmState, RTTIMERLNXSTATE_ACTIVE, RTTIMERLNXSTATE_CB_RESTARTING))
                {
                    pSubTimer->iTick = 0;
                    hrtimer_set_expires(&pSubTimer->u.Hr.LnxTimer, rtTimerLnxNanoToKt(pSubTimer->uNsRestartAt));
                    return HRTIMER_RESTART;
                }
                break;

            default:
                AssertMsgFailed(("%d\n", enmState));
                return HRTIMER_NORESTART;
        }
        ASMNopPause();
    }
}

#endif /* RTTIMER_LINUX_WITH_HRTIMER */
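/*
 * A minimal usage sketch of the one-shot path handled above (illustrative
 * only; rtMyOneShotCb is a made-up callback name):
 *
 *      static DECLCALLBACK(void) rtMyOneShotCb(PRTTIMER pTimer, void *pvUser, uint64_t iTick)
 *      {
 *          // Runs once in timer context; it may call RTTimerStart(pTimer, ...)
 *          // to re-arm itself, which goes through the CB_RESTARTING state.
 *      }
 *      ...
 *      PRTTIMER pTimer;
 *      int rc = RTTimerCreateEx(&pTimer, 0 , RTTIMER_FLAGS_HIGH_RES, rtMyOneShotCb, NULL);  // interval 0 = one-shot
 *      if (RT_SUCCESS(rc))
 *          rc = RTTimerStart(pTimer, RT_NS_1MS);  // first (and only) fire in 1ms
 */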
#if RTLNX_VER_MIN(4,15,0)
/**
 * Timer callback function for standard timers.
 *
 * @param   pLnxTimer   Pointer to the Linux timer structure.
 */
static void rtTimerLinuxStdCallback(struct timer_list *pLnxTimer)
{
    PRTTIMERLNXSUBTIMER pSubTimer = from_timer(pSubTimer, pLnxTimer, u.Std.LnxTimer);
#else
/**
 * Timer callback function for standard timers.
 *
 * @param   ulUser      Address of the sub-timer structure.
 */
static void rtTimerLinuxStdCallback(unsigned long ulUser)
{
    PRTTIMERLNXSUBTIMER pSubTimer = (PRTTIMERLNXSUBTIMER)ulUser;
#endif
    PRTTIMER            pTimer    = pSubTimer->pParent;

    RTTIMERLNX_LOG(("stdcallback %p\n", pTimer));
    if (RT_UNLIKELY(!rtTimerLnxChangeToCallbackState(pSubTimer)))
        return;

#ifdef CONFIG_SMP
    /*
     * Check for unwanted migration.
     */
    if (pTimer->fAllCpus || pTimer->fSpecificCpu)
    {
        RTCPUID idCpu = RTMpCpuId();
        if (RT_UNLIKELY(  pTimer->fAllCpus
                        ? (RTCPUID)(pSubTimer - &pTimer->aSubTimers[0]) != idCpu
                        : pTimer->idCpu != idCpu))
        {
            rtTimerLnxCallbackHandleMigration(pTimer, pSubTimer);
            return;
        }
    }
#endif

    if (pTimer->u64NanoInterval)
    {
        /*
         * Interval timer, calculate the next timeout.
         *
         * The first time around, we'll re-adjust the u.Std.u64NextTS to
         * try to prevent some jittering if we were started at a bad time.
         */
        const uint64_t  iTick       = ++pSubTimer->iTick;
        unsigned long   uCurJiffies = jiffies;
        unsigned long   ulNextJiffies;
        uint64_t        u64NanoInterval;
        unsigned long   cJiffies;
        unsigned long   flFlags;

        spin_lock_irqsave(&pTimer->ChgIntLock, flFlags);
        u64NanoInterval = pTimer->u64NanoInterval;
        cJiffies        = pTimer->cJiffies;
        if (RT_UNLIKELY(pSubTimer->u.Std.fFirstAfterChg))
        {
            pSubTimer->u.Std.fFirstAfterChg = false;
            pSubTimer->u.Std.u64NextTS      = RTTimeSystemNanoTS();
            pSubTimer->u.Std.nsStartTS      = pSubTimer->u.Std.u64NextTS - u64NanoInterval * (iTick - 1);
            pSubTimer->u.Std.ulNextJiffies  = uCurJiffies = jiffies;
        }
        spin_unlock_irqrestore(&pTimer->ChgIntLock, flFlags);

        pSubTimer->u.Std.u64NextTS += u64NanoInterval;
        if (cJiffies)
        {
            ulNextJiffies = pSubTimer->u.Std.ulNextJiffies + cJiffies;
            pSubTimer->u.Std.ulNextJiffies = ulNextJiffies;
            if (time_after_eq(ulNextJiffies, uCurJiffies))
            { /* likely */ }
            else
            {
                unsigned long cJiffiesBehind = uCurJiffies - ulNextJiffies;
                ulNextJiffies = uCurJiffies + cJiffies / 2;
                if (cJiffiesBehind >= HZ / 4) /* Consider if we're lagging too far behind.  Screw the u64NextTS member. */
                    pSubTimer->u.Std.ulNextJiffies = ulNextJiffies;
                /*else: Don't update u.Std.ulNextJiffies so we can continue catching up in the next tick. */
            }
        }
        else
        {
            const uint64_t u64NanoTS = RTTimeSystemNanoTS();
            const int64_t  cNsBehind = u64NanoTS - pSubTimer->u.Std.u64NextTS;
            if (cNsBehind <= 0)
                ulNextJiffies = uCurJiffies + rtTimerLnxNanoToJiffies(pSubTimer->u.Std.u64NextTS - u64NanoTS);
            else if (u64NanoInterval >= RT_NS_1SEC_64 * 2 / HZ)
            {
                ulNextJiffies = uCurJiffies + rtTimerLnxNanoToJiffies(u64NanoInterval / 2);
                if (cNsBehind >= RT_NS_1SEC_64 / HZ / 4) /* Consider if we're lagging too far behind. */
                    pSubTimer->u.Std.u64NextTS = u64NanoTS + u64NanoInterval / 2;
            }
            else
            {
                ulNextJiffies = uCurJiffies + 1;
                if (cNsBehind >= RT_NS_1SEC_64 / HZ / 4) /* Consider if we're lagging too far behind. */
                    pSubTimer->u.Std.u64NextTS = u64NanoTS + RT_NS_1SEC_64 / HZ;
            }
            pSubTimer->u.Std.ulNextJiffies = ulNextJiffies;
        }

        /*
         * Run the timer and re-arm it unless the state changed.
         *
         * We must re-arm it afterwards as we're not in a position to undo this
         * operation if for instance someone stopped or destroyed us while we
         * were in the callback.  (Linux takes care of any races here.)
         */
        pTimer->pfnTimer(pTimer, pTimer->pvUser, iTick);
        if (RT_LIKELY(rtTimerLnxCmpXchgState(&pSubTimer->enmState, RTTIMERLNXSTATE_ACTIVE, RTTIMERLNXSTATE_CALLBACK)))
        {
#ifdef CONFIG_SMP
            if (pTimer->fSpecificCpu || pTimer->fAllCpus)
            {
# if RTLNX_VER_MIN(4,8,0)
                mod_timer(&pSubTimer->u.Std.LnxTimer, ulNextJiffies);
# else
                mod_timer_pinned(&pSubTimer->u.Std.LnxTimer, ulNextJiffies);
# endif
            }
            else
#endif
                mod_timer(&pSubTimer->u.Std.LnxTimer, ulNextJiffies);
            return;
        }
    }
    else
    {
        /*
         * One shot timer, stop it before dispatching it.
         * Allow RTTimerStart as well as RTTimerDestroy to be called from
         * the callback.
         */
        ASMAtomicWriteBool(&pTimer->fSuspended, true);
        pTimer->pfnTimer(pTimer, pTimer->pvUser, ++pSubTimer->iTick);
        if (RT_LIKELY(rtTimerLnxCmpXchgState(&pSubTimer->enmState, RTTIMERLNXSTATE_STOPPED, RTTIMERLNXSTATE_CALLBACK)))
            return;
    }

    /*
     * Some state change occurred while we were in the callback routine.
     */
    for (;;)
    {
        RTTIMERLNXSTATE enmState = rtTimerLnxGetState(&pSubTimer->enmState);
        switch (enmState)
        {
            case RTTIMERLNXSTATE_CB_DESTROYING:
                rtTimerLnxCallbackDestroy(pTimer, pSubTimer);
                return;

            case RTTIMERLNXSTATE_CB_STOPPING:
                if (rtTimerLnxCmpXchgState(&pSubTimer->enmState, RTTIMERLNXSTATE_STOPPED, RTTIMERLNXSTATE_CB_STOPPING))
                    return;
                break;

            case RTTIMERLNXSTATE_CB_RESTARTING:
                if (rtTimerLnxCmpXchgState(&pSubTimer->enmState, RTTIMERLNXSTATE_ACTIVE, RTTIMERLNXSTATE_CB_RESTARTING))
                {
                    uint64_t        u64NanoTS;
                    uint64_t        u64NextTS;
                    unsigned long   flFlags;

                    spin_lock_irqsave(&pTimer->ChgIntLock, flFlags);
                    u64NextTS = pSubTimer->uNsRestartAt;
                    u64NanoTS = RTTimeSystemNanoTS();
                    pSubTimer->iTick                = 0;
                    pSubTimer->u.Std.u64NextTS      = u64NextTS;
                    pSubTimer->u.Std.fFirstAfterChg = true;
                    pSubTimer->u.Std.ulNextJiffies  = u64NextTS > u64NanoTS
                                                    ? jiffies + rtTimerLnxNanoToJiffies(u64NextTS - u64NanoTS)
                                                    : jiffies;
                    spin_unlock_irqrestore(&pTimer->ChgIntLock, flFlags);

#ifdef CONFIG_SMP
                    if (pTimer->fSpecificCpu || pTimer->fAllCpus)
                    {
# if RTLNX_VER_MIN(4,8,0)
                        mod_timer(&pSubTimer->u.Std.LnxTimer, pSubTimer->u.Std.ulNextJiffies);
# else
                        mod_timer_pinned(&pSubTimer->u.Std.LnxTimer, pSubTimer->u.Std.ulNextJiffies);
# endif
                    }
                    else
#endif
                        mod_timer(&pSubTimer->u.Std.LnxTimer, pSubTimer->u.Std.ulNextJiffies);
                    return;
                }
                break;

            default:
                AssertMsgFailed(("%d\n", enmState));
                return;
        }
        ASMNopPause();
    }
}


#ifdef CONFIG_SMP

/**
 * Per-cpu callback function (RTMpOnAll/RTMpOnSpecific).
 *
 * @param   idCpu       The current CPU.
 * @param   pvUser1     Pointer to the timer.
 * @param   pvUser2     Pointer to the argument structure.
 */
static DECLCALLBACK(void) rtTimerLnxStartAllOnCpu(RTCPUID idCpu, void *pvUser1, void *pvUser2)
{
    PRTTIMERLINUXSTARTONCPUARGS pArgs  = (PRTTIMERLINUXSTARTONCPUARGS)pvUser2;
    PRTTIMER                    pTimer = (PRTTIMER)pvUser1;
    Assert(idCpu < pTimer->cCpus);
    rtTimerLnxStartSubTimer(&pTimer->aSubTimers[idCpu], pArgs->u64Now, pArgs->u64First, true /*fPinned*/, pTimer->fHighRes);
}
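/*
 * How rtTimerLnxOmniStart below drives the per-CPU states (a summary of the
 * code that follows, not additional behaviour): every sub-timer must be
 * STOPPED on entry; online CPUs are flagged STARTING and offline ones stay
 * STOPPED; then RTMpOnAll() arms the STARTING ones, taking them to ACTIVE.
 * CPUs that come online later are picked up by rtTimerLinuxMpEvent().
 */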
/**
 * Worker for RTTimerStart() that takes care of the ugly bits.
 *
 * @returns RTTimerStart() return value.
 * @param   pTimer      The timer.
 * @param   pArgs       The argument structure.
 */
static int rtTimerLnxOmniStart(PRTTIMER pTimer, PRTTIMERLINUXSTARTONCPUARGS pArgs)
{
    RTCPUID     iCpu;
    RTCPUSET    OnlineSet;
    RTCPUSET    OnlineSet2;
    int         rc2;

    /*
     * Prepare all the sub-timers for the startup and then flag the timer
     * as a whole as non-suspended, make sure we get them all before
     * clearing fSuspended as the MP handler will be waiting on this
     * should something happen while we're looping.
     */
    RTSpinlockAcquire(pTimer->hSpinlock);

    /* Just make it an omni timer restriction that no stop/start races are allowed. */
    for (iCpu = 0; iCpu < pTimer->cCpus; iCpu++)
        if (rtTimerLnxGetState(&pTimer->aSubTimers[iCpu].enmState) != RTTIMERLNXSTATE_STOPPED)
        {
            RTSpinlockRelease(pTimer->hSpinlock);
            return VERR_TIMER_BUSY;
        }

    do
    {
        RTMpGetOnlineSet(&OnlineSet);
        for (iCpu = 0; iCpu < pTimer->cCpus; iCpu++)
        {
            Assert(pTimer->aSubTimers[iCpu].enmState != RTTIMERLNXSTATE_MP_STOPPING);
            rtTimerLnxSetState(&pTimer->aSubTimers[iCpu].enmState,
                               RTCpuSetIsMember(&OnlineSet, iCpu)
                               ? RTTIMERLNXSTATE_STARTING
                               : RTTIMERLNXSTATE_STOPPED);
        }
    } while (!RTCpuSetIsEqual(&OnlineSet, RTMpGetOnlineSet(&OnlineSet2)));

    ASMAtomicWriteBool(&pTimer->fSuspended, false);

    RTSpinlockRelease(pTimer->hSpinlock);

    /*
     * Start them (can't find any exported function that allows me to
     * do this without the cross calls).
     */
    pArgs->u64Now = RTTimeSystemNanoTS();
    rc2 = RTMpOnAll(rtTimerLnxStartAllOnCpu, pTimer, pArgs);
    AssertRC(rc2); /* screw this if it fails. */

    /*
     * Reset the sub-timers that didn't start up (ALL CPUs case).
     */
    RTSpinlockAcquire(pTimer->hSpinlock);

    for (iCpu = 0; iCpu < pTimer->cCpus; iCpu++)
        if (rtTimerLnxCmpXchgState(&pTimer->aSubTimers[iCpu].enmState, RTTIMERLNXSTATE_STOPPED, RTTIMERLNXSTATE_STARTING))
        {
            /** @todo very odd case for a rainy day. CPUs that temporarily went offline while
             * we were between calls need to be nudged as the MP handler will ignore events for
             * them because of the STARTING state. This is an extremely unlikely case - not that
             * that means anything in my experience... ;-) */
            RTTIMERLNX_LOG(("what!? iCpu=%u -> didn't start\n", iCpu));
        }

    RTSpinlockRelease(pTimer->hSpinlock);

    return VINF_SUCCESS;
}


/**
 * Worker for RTTimerStop() that takes care of the ugly SMP bits.
 *
 * @returns true if there were any active callbacks, false if not.
 * @param   pTimer      The timer (valid).
 * @param   fForDestroy Whether this is for RTTimerDestroy or not.
 */
static bool rtTimerLnxOmniStop(PRTTIMER pTimer, bool fForDestroy)
{
    bool            fActiveCallbacks = false;
    RTCPUID         iCpu;
    RTTIMERLNXSTATE enmState;

    /*
     * Mark the timer as suspended and flag all timers as stopping, except
     * for those being stopped by an MP event.
     */
    RTSpinlockAcquire(pTimer->hSpinlock);

    ASMAtomicWriteBool(&pTimer->fSuspended, true);
    for (iCpu = 0; iCpu < pTimer->cCpus; iCpu++)
    {
        for (;;)
        {
            enmState = rtTimerLnxGetState(&pTimer->aSubTimers[iCpu].enmState);
            if (    enmState == RTTIMERLNXSTATE_STOPPED
                ||  enmState == RTTIMERLNXSTATE_MP_STOPPING)
                break;
            if (   enmState == RTTIMERLNXSTATE_CALLBACK
                || enmState == RTTIMERLNXSTATE_CB_STOPPING
                || enmState == RTTIMERLNXSTATE_CB_RESTARTING)
            {
                Assert(enmState != RTTIMERLNXSTATE_CB_STOPPING || fForDestroy);
                if (rtTimerLnxCmpXchgState(&pTimer->aSubTimers[iCpu].enmState,
                                           !fForDestroy ? RTTIMERLNXSTATE_CB_STOPPING : RTTIMERLNXSTATE_CB_DESTROYING,
                                           enmState))
                {
                    fActiveCallbacks = true;
                    break;
                }
            }
            else
            {
                Assert(enmState == RTTIMERLNXSTATE_ACTIVE);
                if (rtTimerLnxCmpXchgState(&pTimer->aSubTimers[iCpu].enmState, RTTIMERLNXSTATE_STOPPING, enmState))
                    break;
            }
            ASMNopPause();
        }
    }

    RTSpinlockRelease(pTimer->hSpinlock);

    /*
     * Do the actual stopping. Fortunately, this doesn't require any IPIs.
     * Unfortunately it cannot be done synchronously.
     */
    for (iCpu = 0; iCpu < pTimer->cCpus; iCpu++)
        if (rtTimerLnxGetState(&pTimer->aSubTimers[iCpu].enmState) == RTTIMERLNXSTATE_STOPPING)
            rtTimerLnxStopSubTimer(&pTimer->aSubTimers[iCpu], pTimer->fHighRes);

    return fActiveCallbacks;
}


/**
 * Per-cpu callback function (RTMpOnSpecific) used by rtTimerLinuxMpEvent()
 * to start a sub-timer on a cpu that has just come online.
 *
 * @param   idCpu       The current CPU.
 * @param   pvUser1     Pointer to the timer.
 * @param   pvUser2     Pointer to the argument structure.
 */
static DECLCALLBACK(void) rtTimerLinuxMpStartOnCpu(RTCPUID idCpu, void *pvUser1, void *pvUser2)
{
    PRTTIMERLINUXSTARTONCPUARGS pArgs  = (PRTTIMERLINUXSTARTONCPUARGS)pvUser2;
    PRTTIMER                    pTimer = (PRTTIMER)pvUser1;
    RTSPINLOCK                  hSpinlock;
    Assert(idCpu < pTimer->cCpus);

    /*
     * We have to be kind of careful here as we might be racing RTTimerStop
     * (and/or RTTimerDestroy), thus the paranoia.
     */
    hSpinlock = pTimer->hSpinlock;
    if (    hSpinlock != NIL_RTSPINLOCK
        &&  pTimer->u32Magic == RTTIMER_MAGIC)
    {
        RTSpinlockAcquire(hSpinlock);

        if (    !ASMAtomicUoReadBool(&pTimer->fSuspended)
            &&  pTimer->u32Magic == RTTIMER_MAGIC)
        {
            /* We're sane and the timer is not suspended yet. */
            PRTTIMERLNXSUBTIMER pSubTimer = &pTimer->aSubTimers[idCpu];
            if (rtTimerLnxCmpXchgState(&pSubTimer->enmState, RTTIMERLNXSTATE_MP_STARTING, RTTIMERLNXSTATE_STOPPED))
                rtTimerLnxStartSubTimer(pSubTimer, pArgs->u64Now, pArgs->u64First, true /*fPinned*/, pTimer->fHighRes);
        }

        RTSpinlockRelease(hSpinlock);
    }
}


/**
 * MP event notification callback.
 *
 * @param   enmEvent    The event.
 * @param   idCpu       The cpu it applies to.
 * @param   pvUser      The timer.
 */
static DECLCALLBACK(void) rtTimerLinuxMpEvent(RTMPEVENT enmEvent, RTCPUID idCpu, void *pvUser)
{
    PRTTIMER            pTimer    = (PRTTIMER)pvUser;
    PRTTIMERLNXSUBTIMER pSubTimer = &pTimer->aSubTimers[idCpu];
    RTSPINLOCK          hSpinlock;

    Assert(idCpu < pTimer->cCpus);

    /*
     * Some initial paranoia.
     */
    if (pTimer->u32Magic != RTTIMER_MAGIC)
        return;
    hSpinlock = pTimer->hSpinlock;
    if (hSpinlock == NIL_RTSPINLOCK)
        return;

    RTSpinlockAcquire(hSpinlock);

    /* Is it active? */
    if (    !ASMAtomicUoReadBool(&pTimer->fSuspended)
        &&  pTimer->u32Magic == RTTIMER_MAGIC)
    {
        switch (enmEvent)
        {
            /*
             * Try do it without leaving the spin lock, but if we have to, retake it
             * when we're on the right cpu.
             */
            case RTMPEVENT_ONLINE:
                if (rtTimerLnxCmpXchgState(&pSubTimer->enmState, RTTIMERLNXSTATE_MP_STARTING, RTTIMERLNXSTATE_STOPPED))
                {
                    RTTIMERLINUXSTARTONCPUARGS Args;
                    Args.u64Now = RTTimeSystemNanoTS();
                    Args.u64First = 0;

                    if (RTMpCpuId() == idCpu)
                        rtTimerLnxStartSubTimer(pSubTimer, Args.u64Now, Args.u64First, true /*fPinned*/, pTimer->fHighRes);
                    else
                    {
                        rtTimerLnxSetState(&pSubTimer->enmState, RTTIMERLNXSTATE_STOPPED); /* we'll recheck it. */
                        RTSpinlockRelease(hSpinlock);

                        RTMpOnSpecific(idCpu, rtTimerLinuxMpStartOnCpu, pTimer, &Args);
                        return; /* we've left the spinlock */
                    }
                }
                break;

            /*
             * The CPU is (going) offline, make sure the sub-timer is stopped.
             *
             * Linux will migrate it to a different CPU, but we don't want this. The
             * timer function is checking for this.
             */
            case RTMPEVENT_OFFLINE:
            {
                RTTIMERLNXSTATE enmState;
                while (   (enmState = rtTimerLnxGetState(&pSubTimer->enmState)) == RTTIMERLNXSTATE_ACTIVE
                       || enmState == RTTIMERLNXSTATE_CALLBACK
                       || enmState == RTTIMERLNXSTATE_CB_RESTARTING)
                {
                    if (enmState == RTTIMERLNXSTATE_ACTIVE)
                    {
                        if (rtTimerLnxCmpXchgState(&pSubTimer->enmState, RTTIMERLNXSTATE_MP_STOPPING, RTTIMERLNXSTATE_ACTIVE))
                        {
                            RTSpinlockRelease(hSpinlock);

                            rtTimerLnxStopSubTimer(pSubTimer, pTimer->fHighRes);
                            return; /* we've left the spinlock */
                        }
                    }
                    else if (rtTimerLnxCmpXchgState(&pSubTimer->enmState, RTTIMERLNXSTATE_CB_STOPPING, enmState))
                        break;

                    /* State not stable, try again. */
                    ASMNopPause();
                }
                break;
            }
        }
    }

    RTSpinlockRelease(hSpinlock);
}

#endif /* CONFIG_SMP */


/**
 * Callback function used by RTTimerStart via RTMpOnSpecific to start a timer
 * running on a specific CPU.
 *
 * @param   idCpu       The current CPU.
 * @param   pvUser1     Pointer to the timer.
 * @param   pvUser2     Pointer to the argument structure.
 */
static DECLCALLBACK(void) rtTimerLnxStartOnSpecificCpu(RTCPUID idCpu, void *pvUser1, void *pvUser2)
{
    PRTTIMERLINUXSTARTONCPUARGS pArgs  = (PRTTIMERLINUXSTARTONCPUARGS)pvUser2;
    PRTTIMER                    pTimer = (PRTTIMER)pvUser1;
    RT_NOREF_PV(idCpu);
    rtTimerLnxStartSubTimer(&pTimer->aSubTimers[0], pArgs->u64Now, pArgs->u64First, true /*fPinned*/, pTimer->fHighRes);
}
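/*
 * A minimal usage sketch for the omni-timer path (illustrative only;
 * rtMyOmniCb and pvMyCtx are made-up names).  With RTTIMER_FLAGS_CPU_ALL the
 * callback fires on every online CPU, and iTick counts per CPU:
 *
 *      PRTTIMER pTimer;
 *      int rc = RTTimerCreateEx(&pTimer, RT_NS_10MS, RTTIMER_FLAGS_CPU_ALL, rtMyOmniCb, pvMyCtx);
 *      if (RT_SUCCESS(rc))
 *          rc = RTTimerStart(pTimer, 0);   // 0 = fire as soon as possible
 */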
RTDECL(int) RTTimerStart(PRTTIMER pTimer, uint64_t u64First)
{
    RTTIMERLINUXSTARTONCPUARGS Args;
    int rc2;
    IPRT_LINUX_SAVE_EFL_AC();

    /*
     * Validate.
     */
    AssertPtrReturn(pTimer, VERR_INVALID_HANDLE);
    AssertReturn(pTimer->u32Magic == RTTIMER_MAGIC, VERR_INVALID_HANDLE);

    if (!ASMAtomicUoReadBool(&pTimer->fSuspended))
    {
        IPRT_LINUX_RESTORE_EFL_AC();
        return VERR_TIMER_ACTIVE;
    }
    RTTIMERLNX_LOG(("start %p cCpus=%d\n", pTimer, pTimer->cCpus));

    Args.u64First = u64First;
#ifdef CONFIG_SMP
    /*
     * Omni timer?
     */
    if (pTimer->fAllCpus)
    {
        rc2 = rtTimerLnxOmniStart(pTimer, &Args);
        IPRT_LINUX_RESTORE_EFL_AC();
        return rc2;
    }
#endif

    /*
     * Simple timer - Pretty straight forward if it wasn't for restarting.
     */
    Args.u64Now = RTTimeSystemNanoTS();
    ASMAtomicWriteU64(&pTimer->aSubTimers[0].uNsRestartAt, Args.u64Now + u64First);
    for (;;)
    {
        RTTIMERLNXSTATE enmState = rtTimerLnxGetState(&pTimer->aSubTimers[0].enmState);
        switch (enmState)
        {
            case RTTIMERLNXSTATE_STOPPED:
                if (rtTimerLnxCmpXchgState(&pTimer->aSubTimers[0].enmState, RTTIMERLNXSTATE_STARTING, RTTIMERLNXSTATE_STOPPED))
                {
                    ASMAtomicWriteBool(&pTimer->fSuspended, false);
                    if (!pTimer->fSpecificCpu)
                        rtTimerLnxStartSubTimer(&pTimer->aSubTimers[0], Args.u64Now, Args.u64First,
                                                false /*fPinned*/, pTimer->fHighRes);
                    else
                    {
                        rc2 = RTMpOnSpecific(pTimer->idCpu, rtTimerLnxStartOnSpecificCpu, pTimer, &Args);
                        if (RT_FAILURE(rc2))
                        {
                            /* Suspend it, the cpu id is probably invalid or offline. */
                            ASMAtomicWriteBool(&pTimer->fSuspended, true);
                            rtTimerLnxSetState(&pTimer->aSubTimers[0].enmState, RTTIMERLNXSTATE_STOPPED);
                            IPRT_LINUX_RESTORE_EFL_AC();
                            return rc2;
                        }
                    }
                    IPRT_LINUX_RESTORE_EFL_AC();
                    return VINF_SUCCESS;
                }
                break;

            case RTTIMERLNXSTATE_CALLBACK:
            case RTTIMERLNXSTATE_CB_STOPPING:
                if (rtTimerLnxCmpXchgState(&pTimer->aSubTimers[0].enmState, RTTIMERLNXSTATE_CB_RESTARTING, enmState))
                {
                    ASMAtomicWriteBool(&pTimer->fSuspended, false);
                    IPRT_LINUX_RESTORE_EFL_AC();
                    return VINF_SUCCESS;
                }
                break;

            default:
                AssertMsgFailed(("%d\n", enmState));
                IPRT_LINUX_RESTORE_EFL_AC();
                return VERR_INTERNAL_ERROR_4;
        }
        ASMNopPause();
    }
}
RT_EXPORT_SYMBOL(RTTimerStart);


/**
 * Common worker for RTTimerStop and RTTimerDestroy.
 *
 * @returns true if there were any active callbacks, false if not.
 * @param   pTimer      The timer to stop.
 * @param   fForDestroy Whether it's RTTimerDestroy calling or not.
 */
static bool rtTimerLnxStop(PRTTIMER pTimer, bool fForDestroy)
{
    RTTIMERLNX_LOG(("lnxstop %p %d\n", pTimer, fForDestroy));
#ifdef CONFIG_SMP
    /*
     * Omni timer?
     */
    if (pTimer->fAllCpus)
        return rtTimerLnxOmniStop(pTimer, fForDestroy);
#endif

    /*
     * Simple timer.
     */
    ASMAtomicWriteBool(&pTimer->fSuspended, true);
    for (;;)
    {
        RTTIMERLNXSTATE enmState = rtTimerLnxGetState(&pTimer->aSubTimers[0].enmState);
        switch (enmState)
        {
            case RTTIMERLNXSTATE_ACTIVE:
                if (rtTimerLnxCmpXchgState(&pTimer->aSubTimers[0].enmState, RTTIMERLNXSTATE_STOPPING, RTTIMERLNXSTATE_ACTIVE))
                {
                    rtTimerLnxStopSubTimer(&pTimer->aSubTimers[0], pTimer->fHighRes);
                    return false;
                }
                break;

            case RTTIMERLNXSTATE_CALLBACK:
            case RTTIMERLNXSTATE_CB_RESTARTING:
            case RTTIMERLNXSTATE_CB_STOPPING:
                Assert(enmState != RTTIMERLNXSTATE_CB_STOPPING || fForDestroy);
                if (rtTimerLnxCmpXchgState(&pTimer->aSubTimers[0].enmState,
                                           !fForDestroy ? RTTIMERLNXSTATE_CB_STOPPING : RTTIMERLNXSTATE_CB_DESTROYING,
                                           enmState))
                    return true;
                break;

            case RTTIMERLNXSTATE_STOPPED:
                return false;

            case RTTIMERLNXSTATE_CB_DESTROYING:
                AssertMsgFailed(("enmState=%d pTimer=%p\n", enmState, pTimer));
                return true;

            default:
            case RTTIMERLNXSTATE_STARTING:
            case RTTIMERLNXSTATE_MP_STARTING:
            case RTTIMERLNXSTATE_STOPPING:
            case RTTIMERLNXSTATE_MP_STOPPING:
                AssertMsgFailed(("enmState=%d pTimer=%p\n", enmState, pTimer));
                return false;
        }

        /* State not stable, try again. */
        ASMNopPause();
    }
}


RTDECL(int) RTTimerStop(PRTTIMER pTimer)
{
    /*
     * Validate.
     */
    IPRT_LINUX_SAVE_EFL_AC();
    AssertPtrReturn(pTimer, VERR_INVALID_HANDLE);
    AssertReturn(pTimer->u32Magic == RTTIMER_MAGIC, VERR_INVALID_HANDLE);
    RTTIMERLNX_LOG(("stop %p\n", pTimer));

    if (ASMAtomicUoReadBool(&pTimer->fSuspended))
    {
        IPRT_LINUX_RESTORE_EFL_AC();
        return VERR_TIMER_SUSPENDED;
    }

    rtTimerLnxStop(pTimer, false /*fForDestroy*/);

    IPRT_LINUX_RESTORE_EFL_AC();
    return VINF_SUCCESS;
}
RT_EXPORT_SYMBOL(RTTimerStop);


RTDECL(int) RTTimerChangeInterval(PRTTIMER pTimer, uint64_t u64NanoInterval)
{
    unsigned long cJiffies;
    unsigned long flFlags;
    IPRT_LINUX_SAVE_EFL_AC();

    /*
     * Validate.
     */
    AssertPtrReturn(pTimer, VERR_INVALID_HANDLE);
    AssertReturn(pTimer->u32Magic == RTTIMER_MAGIC, VERR_INVALID_HANDLE);
    AssertReturn(u64NanoInterval, VERR_INVALID_PARAMETER);
    AssertReturn(u64NanoInterval < UINT64_MAX / 8, VERR_INVALID_PARAMETER);
    AssertReturn(pTimer->u64NanoInterval, VERR_INVALID_STATE);
    RTTIMERLNX_LOG(("change %p %llu\n", pTimer, u64NanoInterval));

#ifdef RTTIMER_LINUX_WITH_HRTIMER
    /*
     * For the high resolution timers it is easy since we don't care so much
     * about when it is applied to the sub-timers.
     */
    if (pTimer->fHighRes)
    {
        ASMAtomicWriteU64(&pTimer->u64NanoInterval, u64NanoInterval);
        IPRT_LINUX_RESTORE_EFL_AC();
        return VINF_SUCCESS;
    }
#endif

    /*
     * Standard timers have a bit more complicated way of calculating
     * their interval and such. So, forget omni timers for now.
     */
    if (pTimer->cCpus > 1)
    {
        IPRT_LINUX_RESTORE_EFL_AC();
        return VERR_NOT_SUPPORTED;
    }

    cJiffies = u64NanoInterval / (RT_NS_1SEC / HZ);
    if (cJiffies * (RT_NS_1SEC / HZ) != u64NanoInterval)
        cJiffies = 0;

    spin_lock_irqsave(&pTimer->ChgIntLock, flFlags);
    pTimer->aSubTimers[0].u.Std.fFirstAfterChg = true;
    pTimer->cJiffies = cJiffies;
    ASMAtomicWriteU64(&pTimer->u64NanoInterval, u64NanoInterval);
    spin_unlock_irqrestore(&pTimer->ChgIntLock, flFlags);
    IPRT_LINUX_RESTORE_EFL_AC();
    return VINF_SUCCESS;
}
RT_EXPORT_SYMBOL(RTTimerChangeInterval);
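/*
 * Illustrative only (rtMyCb is a made-up name): RTTimerChangeInterval above
 * may be called from the timer callback itself; for a standard (non-high-res)
 * timer the new interval is picked up via fFirstAfterChg on the next tick:
 *
 *      static DECLCALLBACK(void) rtMyCb(PRTTIMER pTimer, void *pvUser, uint64_t iTick)
 *      {
 *          if (iTick == 100)
 *              RTTimerChangeInterval(pTimer, RT_NS_10MS);  // slow down after 100 ticks
 *      }
 */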
RTDECL(int) RTTimerDestroy(PRTTIMER pTimer)
{
    bool fCanDestroy;
    IPRT_LINUX_SAVE_EFL_AC();

    /*
     * Validate. It's ok to pass NULL pointer.
     */
    if (pTimer == /*NIL_RTTIMER*/ NULL)
        return VINF_SUCCESS;
    AssertPtrReturn(pTimer, VERR_INVALID_HANDLE);
    AssertReturn(pTimer->u32Magic == RTTIMER_MAGIC, VERR_INVALID_HANDLE);
    RTTIMERLNX_LOG(("destroy %p\n", pTimer));
/** @todo We should invalidate the magic here! */

    /*
     * Stop the timer if it's still active, then destroy it if we can.
     */
    if (!ASMAtomicUoReadBool(&pTimer->fSuspended))
        fCanDestroy = rtTimerLnxStop(pTimer, true /*fForDestroy*/);
    else
    {
        uint32_t iCpu = pTimer->cCpus;
        if (pTimer->cCpus > 1)
            RTSpinlockAcquire(pTimer->hSpinlock);

        fCanDestroy = true;
        while (iCpu-- > 0)
        {
            for (;;)
            {
                RTTIMERLNXSTATE enmState = rtTimerLnxGetState(&pTimer->aSubTimers[iCpu].enmState);
                switch (enmState)
                {
                    case RTTIMERLNXSTATE_CALLBACK:
                    case RTTIMERLNXSTATE_CB_RESTARTING:
                    case RTTIMERLNXSTATE_CB_STOPPING:
                        if (!rtTimerLnxCmpXchgState(&pTimer->aSubTimers[iCpu].enmState, RTTIMERLNXSTATE_CB_DESTROYING, enmState))
                            continue;
                        fCanDestroy = false;
                        break;

                    case RTTIMERLNXSTATE_CB_DESTROYING:
                        AssertMsgFailed(("%d\n", enmState));
                        fCanDestroy = false;
                        break;

                    default:
                        break;
                }
                break;
            }
        }

        if (pTimer->cCpus > 1)
            RTSpinlockRelease(pTimer->hSpinlock);
    }

    if (fCanDestroy)
    {
        /* For paranoid reasons, defer actually destroying the timer when
           in atomic or interrupt context. */
#if RTLNX_VER_MIN(2,5,32)
        if (in_atomic() || in_interrupt())
#else
        if (in_interrupt())
#endif
            rtR0LnxWorkqueuePush(&pTimer->DtorWorkqueueItem, rtTimerLnxDestroyDeferred);
        else
            rtTimerLnxDestroyIt(pTimer);
    }

    IPRT_LINUX_RESTORE_EFL_AC();
    return VINF_SUCCESS;
}
RT_EXPORT_SYMBOL(RTTimerDestroy);


RTDECL(int) RTTimerCreateEx(PRTTIMER *ppTimer, uint64_t u64NanoInterval, uint32_t fFlags, PFNRTTIMER pfnTimer, void *pvUser)
{
    PRTTIMER pTimer;
    RTCPUID  iCpu;
    unsigned cCpus;
    int      rc;
    IPRT_LINUX_SAVE_EFL_AC();

    rtR0LnxWorkqueueFlush();                /* for 2.4 */
    *ppTimer = NULL;

    /*
     * Validate flags.
     */
    if (!RTTIMER_FLAGS_ARE_VALID(fFlags))
    {
        IPRT_LINUX_RESTORE_EFL_AC();
        return VERR_INVALID_PARAMETER;
    }
    if (    (fFlags & RTTIMER_FLAGS_CPU_SPECIFIC)
        &&  (fFlags & RTTIMER_FLAGS_CPU_ALL) != RTTIMER_FLAGS_CPU_ALL
        &&  !RTMpIsCpuPossible(RTMpCpuIdFromSetIndex(fFlags & RTTIMER_FLAGS_CPU_MASK)))
    {
        IPRT_LINUX_RESTORE_EFL_AC();
        return VERR_CPU_NOT_FOUND;
    }

    /*
     * Allocate the timer handler.
     */
    cCpus = 1;
#ifdef CONFIG_SMP
    if ((fFlags & RTTIMER_FLAGS_CPU_ALL) == RTTIMER_FLAGS_CPU_ALL)
    {
        cCpus = RTMpGetMaxCpuId() + 1;
        Assert(cCpus <= RTCPUSET_MAX_CPUS); /* On linux we have a 1:1 relationship between cpuid and set index. */
        AssertReturnStmt(u64NanoInterval, IPRT_LINUX_RESTORE_EFL_AC(), VERR_NOT_IMPLEMENTED); /* We don't implement single shot on all cpus, sorry. */
    }
#endif

    rc = RTMemAllocEx(RT_UOFFSETOF_DYN(RTTIMER, aSubTimers[cCpus]), 0,
                      RTMEMALLOCEX_FLAGS_ZEROED | RTMEMALLOCEX_FLAGS_ANY_CTX_FREE, (void **)&pTimer);
    if (RT_FAILURE(rc))
    {
        IPRT_LINUX_RESTORE_EFL_AC();
        return rc;
    }

    /*
     * Initialize it.
     */
    pTimer->u32Magic     = RTTIMER_MAGIC;
    pTimer->hSpinlock    = NIL_RTSPINLOCK;
    pTimer->fSuspended   = true;
    pTimer->fHighRes     = !!(fFlags & RTTIMER_FLAGS_HIGH_RES);
#ifdef CONFIG_SMP
    pTimer->fSpecificCpu = (fFlags & RTTIMER_FLAGS_CPU_SPECIFIC) && (fFlags & RTTIMER_FLAGS_CPU_ALL) != RTTIMER_FLAGS_CPU_ALL;
    pTimer->fAllCpus     = (fFlags & RTTIMER_FLAGS_CPU_ALL) == RTTIMER_FLAGS_CPU_ALL;
    pTimer->idCpu        = pTimer->fSpecificCpu
                         ? RTMpCpuIdFromSetIndex(fFlags & RTTIMER_FLAGS_CPU_MASK)
                         : NIL_RTCPUID;
#else
    pTimer->fSpecificCpu = !!(fFlags & RTTIMER_FLAGS_CPU_SPECIFIC);
    pTimer->idCpu        = RTMpCpuId();
#endif
    pTimer->cCpus        = cCpus;
    pTimer->pfnTimer     = pfnTimer;
    pTimer->pvUser       = pvUser;
    pTimer->u64NanoInterval = u64NanoInterval;
    pTimer->cJiffies     = u64NanoInterval / (RT_NS_1SEC / HZ);
    if (pTimer->cJiffies * (RT_NS_1SEC / HZ) != u64NanoInterval)
        pTimer->cJiffies = 0;
    spin_lock_init(&pTimer->ChgIntLock);

    for (iCpu = 0; iCpu < cCpus; iCpu++)
    {
#ifdef RTTIMER_LINUX_WITH_HRTIMER
        if (pTimer->fHighRes)
        {
            hrtimer_init(&pTimer->aSubTimers[iCpu].u.Hr.LnxTimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
            pTimer->aSubTimers[iCpu].u.Hr.LnxTimer.function = rtTimerLinuxHrCallback;
        }
        else
#endif
        {
#if RTLNX_VER_MIN(4,15,0)
            timer_setup(&pTimer->aSubTimers[iCpu].u.Std.LnxTimer, rtTimerLinuxStdCallback, TIMER_PINNED);
#elif RTLNX_VER_MIN(4,8,0)
            init_timer_pinned(&pTimer->aSubTimers[iCpu].u.Std.LnxTimer);
#else
            init_timer(&pTimer->aSubTimers[iCpu].u.Std.LnxTimer);
#endif
#if RTLNX_VER_MAX(4,15,0)
            pTimer->aSubTimers[iCpu].u.Std.LnxTimer.data     = (unsigned long)&pTimer->aSubTimers[iCpu];
            pTimer->aSubTimers[iCpu].u.Std.LnxTimer.function = rtTimerLinuxStdCallback;
#endif
            pTimer->aSubTimers[iCpu].u.Std.LnxTimer.expires  = jiffies;
            pTimer->aSubTimers[iCpu].u.Std.u64NextTS         = 0;
        }
        pTimer->aSubTimers[iCpu].iTick    = 0;
        pTimer->aSubTimers[iCpu].pParent  = pTimer;
        pTimer->aSubTimers[iCpu].enmState = RTTIMERLNXSTATE_STOPPED;
    }

#ifdef CONFIG_SMP
    /*
     * If this is running on ALL cpus, we'll have to register a callback
     * for MP events (so timers can be started/stopped on cpus going
     * online/offline). We also create the spinlock for synchronizing
     * stop/start/mp-event.
     */
    if (cCpus > 1)
    {
        int rc = RTSpinlockCreate(&pTimer->hSpinlock, RTSPINLOCK_FLAGS_INTERRUPT_SAFE, "RTTimerLnx");
        if (RT_SUCCESS(rc))
            rc = RTMpNotificationRegister(rtTimerLinuxMpEvent, pTimer);
        else
            pTimer->hSpinlock = NIL_RTSPINLOCK;
        if (RT_FAILURE(rc))
        {
            RTTimerDestroy(pTimer);
            IPRT_LINUX_RESTORE_EFL_AC();
            return rc;
        }
    }
#endif /* CONFIG_SMP */

    RTTIMERLNX_LOG(("create %p hires=%d fFlags=%#x cCpus=%u\n", pTimer, pTimer->fHighRes, fFlags, cCpus));
    *ppTimer = pTimer;
    IPRT_LINUX_RESTORE_EFL_AC();
    return VINF_SUCCESS;
}
RT_EXPORT_SYMBOL(RTTimerCreateEx);
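/*
 * For reference (illustrative arithmetic; HZ is kernel-config dependent):
 * RTTimerGetSystemGranularity() below reports the scheduler-tick resolution,
 * e.g. RT_NS_1SEC / HZ = 1000000000 / 250 = 4000000 ns (4ms) for HZ=250, or
 * twice that on the kernel versions covered by the hack it mentions.
 */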
RTDECL(uint32_t) RTTimerGetSystemGranularity(void)
{
#if 0 /** @todo Not sure if this is what we want or not... Add new API for
       *        querying the resolution of the high res timers? */
    struct timespec Ts;
    int rc;
    IPRT_LINUX_SAVE_EFL_AC();
    rc = hrtimer_get_res(CLOCK_MONOTONIC, &Ts);
    IPRT_LINUX_RESTORE_EFL_AC();
    if (!rc)
    {
        Assert(!Ts.tv_sec);
        return Ts.tv_nsec;
    }
#endif
    /* On 4.9, 4.10 and 4.12 we've observed tstRTR0Timer failures of the omni timer
       tests where we get about half of the ticks we want.  The failing test is
       using this value as interval.  So, this is a very very crude hack to try
       make omni timers work correctly without actually knowing what's going
       wrong... */
#if RTLNX_VER_MIN(4,9,0) && RTLNX_VER_MAX(4,13,0)
    return RT_NS_1SEC * 2 / HZ; /* ns */
#else
    return RT_NS_1SEC / HZ; /* ns */
#endif
}
RT_EXPORT_SYMBOL(RTTimerGetSystemGranularity);


RTDECL(int) RTTimerRequestSystemGranularity(uint32_t u32Request, uint32_t *pu32Granted)
{
    RT_NOREF_PV(u32Request); RT_NOREF_PV(pu32Granted);
    return VERR_NOT_SUPPORTED;
}
RT_EXPORT_SYMBOL(RTTimerRequestSystemGranularity);


RTDECL(int) RTTimerReleaseSystemGranularity(uint32_t u32Granted)
{
    RT_NOREF_PV(u32Granted);
    return VERR_NOT_SUPPORTED;
}
RT_EXPORT_SYMBOL(RTTimerReleaseSystemGranularity);


RTDECL(bool) RTTimerCanDoHighResolution(void)
{
#ifdef RTTIMER_LINUX_WITH_HRTIMER
    return true;
#else
    return false;
#endif
}
RT_EXPORT_SYMBOL(RTTimerCanDoHighResolution);
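/*
 * Putting the API together (illustrative sketch only; names containing "My"
 * are made up).  A caller would typically pick high resolution when available:
 *
 *      uint32_t fFlags = RTTimerCanDoHighResolution() ? RTTIMER_FLAGS_HIGH_RES : 0;
 *      PRTTIMER pTimer;
 *      int rc = RTTimerCreateEx(&pTimer, RT_NS_1MS, fFlags, rtMyPeriodicCb, &g_MyCtx);
 *      if (RT_SUCCESS(rc))
 *          rc = RTTimerStart(pTimer, 0);
 *      ...
 *      RTTimerDestroy(pTimer);     // NULL is OK; stops the timer if still active
 */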