#!/usr/bin/env bash
set -ex
set -o pipefail

at_exit() {
    if [ $? -ne 0 ]; then
        # We're exiting with a non-zero EC, let's dump test artifacts
        # for easier debugging
        [ -f "$straceLog" ] && cat "$straceLog"
        [ -f "$journalLog" ] && cat "$journalLog"
    fi
}

trap at_exit EXIT

systemd-analyze log-level debug
systemd-analyze log-target journal

# Log files
straceLog='strace.log'
journalLog='journal.log'

# Systemd config files
testUnit='numa-test.service'
testUnitFile="/run/systemd/system/$testUnit"
testUnitNUMAConf="$testUnitFile.d/numa.conf"

# Sleep constants (we should probably figure out something better but nothing comes to mind)
journalSleep=5
sleepAfterStart=1

# Journal cursor for easier navigation
journalCursorFile="jounalCursorFile"

startStrace() {
    coproc strace -qq -p 1 -o $straceLog -e set_mempolicy -s 1024 $1
    # Wait for strace to properly "initialize"
    sleep $sleepAfterStart
}

stopStrace() {
    kill -s TERM $COPROC_PID
    # Make sure the strace process is indeed dead
    while kill -0 $COPROC_PID 2>/dev/null; do sleep 0.1; done
}

startJournalctl() {
    # Save journal's cursor for later navigation
    journalctl --no-pager --cursor-file="$journalCursorFile" -n0 -ocat
}

stopJournalctl() {
    local unit="${1:-init.scope}"
    # Using journalctl --sync should be better than using SIGRTMIN+1, as
    # the --sync wait until the synchronization is complete
    echo "Force journald to write all queued messages"
    journalctl --sync
    journalctl -u $unit --cursor-file="$journalCursorFile" > "$journalLog"
}

checkNUMA() {
    # NUMA enabled system should have at least NUMA node0
    test -e /sys/devices/system/node/node0
}

writePID1NUMAPolicy() {
    echo [Manager] > $confDir/numa.conf
    echo NUMAPolicy=$1 >> $confDir/numa.conf
    echo NUMAMask=$2>> $confDir/numa.conf
}

writeTestUnit() {
    mkdir -p $testUnitFile.d/
    echo [Service] > $testUnitFile
    echo ExecStart=/bin/sleep 3600 >> $testUnitFile
}

writeTestUnitNUMAPolicy() {
    echo [Service] > $testUnitNUMAConf
    echo NUMAPolicy=$1 >> $testUnitNUMAConf
    echo NUMAMask=$2>> $testUnitNUMAConf
    systemctl daemon-reload
}

pid1ReloadWithStrace() {
    startStrace
    systemctl daemon-reload
    sleep $sleepAfterStart
    stopStrace
}

pid1ReloadWithJournal() {
    startJournalctl
    systemctl daemon-reload
    stopJournalctl
}

pid1StartUnitWithStrace() {
    startStrace '-f'
    systemctl start $1
    sleep $sleepAfterStart
    stopStrace
}

pid1StartUnitWithJournal() {
    startJournalctl
    systemctl start $1
    sleep $sleepAfterStart
    stopJournalctl
}

pid1StopUnit() {
    systemctl stop $1
}

systemctlCheckNUMAProperties() {
    local LOGFILE="$(mktemp)"
    systemctl show -p NUMAPolicy $1 > "$LOGFILE"
    grep "NUMAPolicy=$2" "$LOGFILE"

    > "$LOGFILE"

    if [ -n "$3" ]; then
        systemctl show -p NUMAMask $1 > "$LOGFILE"
        grep "NUMAMask=$3" "$LOGFILE"
    fi
}

writeTestUnit

# Create systemd config drop-in directory
confDir="/run/systemd/system.conf.d/"
mkdir -p "$confDir"

if ! checkNUMA; then
    echo >&2 "NUMA is not supported on this machine, switching to a simple sanity check"

    echo "PID1 NUMAPolicy=default && NUMAMask=0 check without NUMA support"
    writePID1NUMAPolicy "default" "0"
    startJournalctl
    systemctl daemon-reload
    stopJournalctl
    grep "NUMA support not available, ignoring" "$journalLog"

    echo "systemd-run NUMAPolicy=default && NUMAMask=0 check without NUMA support"
    runUnit='numa-systemd-run-test.service'
    startJournalctl
    systemd-run -p NUMAPolicy=default -p NUMAMask=0 --unit $runUnit sleep 1000
    sleep $sleepAfterStart
    pid1StopUnit $runUnit
    stopJournalctl $runUnit
    grep "NUMA support not available, ignoring" "$journalLog"

else
    echo "PID1 NUMAPolicy support - Default policy w/o mask"
    writePID1NUMAPolicy "default"
    pid1ReloadWithStrace
    # Kernel requires that nodemask argument is set to NULL when setting default policy
    grep "set_mempolicy(MPOL_DEFAULT, NULL" $straceLog

    echo "PID1 NUMAPolicy support - Default policy w/ mask"
    writePID1NUMAPolicy "default" "0"
    pid1ReloadWithStrace
    grep "set_mempolicy(MPOL_DEFAULT, NULL" $straceLog

    echo "PID1 NUMAPolicy support - Bind policy w/o mask"
    writePID1NUMAPolicy "bind"
    pid1ReloadWithJournal
    grep "Failed to set NUMA memory policy: Invalid argument" $journalLog

    echo "PID1 NUMAPolicy support - Bind policy w/ mask"
    writePID1NUMAPolicy "bind" "0"
    pid1ReloadWithStrace
    grep -P "set_mempolicy\(MPOL_BIND, \[0x0*1\]" $straceLog

    echo "PID1 NUMAPolicy support - Interleave policy w/o mask"
    writePID1NUMAPolicy "interleave"
    pid1ReloadWithJournal
    grep "Failed to set NUMA memory policy: Invalid argument" $journalLog

    echo "PID1 NUMAPolicy support - Interleave policy w/ mask"
    writePID1NUMAPolicy "interleave" "0"
    pid1ReloadWithStrace
    grep -P "set_mempolicy\(MPOL_INTERLEAVE, \[0x0*1\]" $straceLog

    echo "PID1 NUMAPolicy support - Preferred policy w/o mask"
    writePID1NUMAPolicy "preferred"
    pid1ReloadWithJournal
    # Preferred policy with empty node mask is actually allowed and should reset allocation policy to default
    ! grep "Failed to set NUMA memory policy: Invalid argument" $journalLog

    echo "PID1 NUMAPolicy support - Preferred policy w/ mask"
    writePID1NUMAPolicy "preferred" "0"
    pid1ReloadWithStrace
    grep -P "set_mempolicy\(MPOL_PREFERRED, \[0x0*1\]" $straceLog

    echo "PID1 NUMAPolicy support - Local policy w/o mask"
    writePID1NUMAPolicy "local"
    pid1ReloadWithStrace
    # Kernel requires that nodemask argument is set to NULL when setting default policy
    # The unpatched versions of strace don't recognize the MPOL_LOCAL constant and
    # return a numerical constant instead (with a comment):
    #   set_mempolicy(0x4 /* MPOL_??? */, NULL, 0) = 0
    # Let's cover this scenario as well
    grep -E "set_mempolicy\((MPOL_LOCAL|0x4 [^,]*), NULL" $straceLog

    echo "PID1 NUMAPolicy support - Local policy w/ mask"
    writePID1NUMAPolicy "local" "0"
    pid1ReloadWithStrace
    grep -E "set_mempolicy\((MPOL_LOCAL|0x4 [^,]*), NULL" $straceLog

    echo "Unit file NUMAPolicy support - Default policy w/o mask"
    writeTestUnitNUMAPolicy "default"
    pid1StartUnitWithStrace $testUnit
    systemctlCheckNUMAProperties $testUnit "default"
    pid1StopUnit $testUnit
    grep "set_mempolicy(MPOL_DEFAULT, NULL" $straceLog

    echo "Unit file NUMAPolicy support - Default policy w/ mask"
    writeTestUnitNUMAPolicy "default" "0"
    pid1StartUnitWithStrace $testUnit
    systemctlCheckNUMAProperties $testUnit "default" "0"
    pid1StopUnit $testUnit
    # Maks must be ignored
    grep "set_mempolicy(MPOL_DEFAULT, NULL" $straceLog

    echo "Unit file NUMAPolicy support - Bind policy w/o mask"
    writeTestUnitNUMAPolicy "bind"
    pid1StartUnitWithJournal $testUnit
    pid1StopUnit $testUnit
    grep "numa-test.service: Main process exited, code=exited, status=242/NUMA" $journalLog

    echo "Unit file NUMAPolicy support - Bind policy w/ mask"
    writeTestUnitNUMAPolicy "bind" "0"
    pid1StartUnitWithStrace $testUnit
    systemctlCheckNUMAProperties $testUnit "bind" "0"
    pid1StopUnit $testUnit
    grep -P "set_mempolicy\(MPOL_BIND, \[0x0*1\]" $straceLog

    echo "Unit file NUMAPolicy support - Interleave policy w/o mask"
    writeTestUnitNUMAPolicy "interleave"
    pid1StartUnitWithStrace $testUnit
    pid1StopUnit $testUnit
    grep "numa-test.service: Main process exited, code=exited, status=242/NUMA" $journalLog

    echo "Unit file NUMAPolicy support - Interleave policy w/ mask"
    writeTestUnitNUMAPolicy "interleave" "0"
    pid1StartUnitWithStrace $testUnit
    systemctlCheckNUMAProperties $testUnit "interleave" "0"
    pid1StopUnit $testUnit
    grep -P "set_mempolicy\(MPOL_INTERLEAVE, \[0x0*1\]" $straceLog

    echo "Unit file NUMAPolicy support - Preferred policy w/o mask"
    writeTestUnitNUMAPolicy "preferred"
    pid1StartUnitWithJournal $testUnit
    systemctlCheckNUMAProperties $testUnit "preferred"
    pid1StopUnit $testUnit
    ! grep "numa-test.service: Main process exited, code=exited, status=242/NUMA" $journalLog

    echo "Unit file NUMAPolicy support - Preferred policy w/ mask"
    writeTestUnitNUMAPolicy "preferred" "0"
    pid1StartUnitWithStrace $testUnit
    systemctlCheckNUMAProperties $testUnit "preferred" "0"
    pid1StopUnit $testUnit
    grep -P "set_mempolicy\(MPOL_PREFERRED, \[0x0*1\]" $straceLog

    echo "Unit file NUMAPolicy support - Local policy w/o mask"
    writeTestUnitNUMAPolicy "local"
    pid1StartUnitWithStrace $testUnit
    systemctlCheckNUMAProperties $testUnit "local"
    pid1StopUnit $testUnit
    grep -E "set_mempolicy\((MPOL_LOCAL|0x4 [^,]*), NULL" $straceLog

    echo "Unit file NUMAPolicy support - Local policy w/ mask"
    writeTestUnitNUMAPolicy "local" "0"
    pid1StartUnitWithStrace $testUnit
    systemctlCheckNUMAProperties $testUnit "local" "0"
    pid1StopUnit $testUnit
    # Maks must be ignored
    grep -E "set_mempolicy\((MPOL_LOCAL|0x4 [^,]*), NULL" $straceLog

    echo "Unit file CPUAffinity=NUMA support"
    writeTestUnitNUMAPolicy "bind" "0"
    echo "CPUAffinity=numa" >> $testUnitNUMAConf
    systemctl daemon-reload
    systemctl start $testUnit
    systemctlCheckNUMAProperties $testUnit "bind" "0"
    pid=$(systemctl show --value -p MainPID $testUnit)
    cpulist=$(cat /sys/devices/system/node/node0/cpulist)
    affinity_systemd=$(systemctl show --value -p CPUAffinity $testUnit)
    [ $cpulist = $affinity_systemd ]
    pid1StopUnit $testUnit

    echo "systemd-run NUMAPolicy support"
    runUnit='numa-systemd-run-test.service'

    systemd-run -p NUMAPolicy=default --unit $runUnit sleep 1000
    systemctlCheckNUMAProperties $runUnit "default"
    pid1StopUnit $runUnit

    systemd-run -p NUMAPolicy=default -p NUMAMask=0 --unit $runUnit sleep 1000
    systemctlCheckNUMAProperties $runUnit "default" ""
    pid1StopUnit $runUnit

    systemd-run -p NUMAPolicy=bind -p NUMAMask=0 --unit $runUnit sleep 1000
    systemctlCheckNUMAProperties $runUnit "bind" "0"
    pid1StopUnit $runUnit

    systemd-run -p NUMAPolicy=interleave -p NUMAMask=0 --unit $runUnit sleep 1000
    systemctlCheckNUMAProperties $runUnit "interleave" "0"
    pid1StopUnit $runUnit

    systemd-run -p NUMAPolicy=preferred -p NUMAMask=0 --unit $runUnit sleep 1000
    systemctlCheckNUMAProperties $runUnit "preferred" "0"
    pid1StopUnit $runUnit

    systemd-run -p NUMAPolicy=local --unit $runUnit sleep 1000
    systemctlCheckNUMAProperties $runUnit "local"
    pid1StopUnit $runUnit

    systemd-run -p NUMAPolicy=local -p NUMAMask=0 --unit $runUnit sleep 1000
    systemctlCheckNUMAProperties $runUnit "local" ""
    pid1StopUnit $runUnit

    systemd-run -p NUMAPolicy=local -p NUMAMask=0 -p CPUAffinity=numa --unit $runUnit sleep 1000
    systemctlCheckNUMAProperties $runUnit "local" ""
    systemctl cat $runUnit | grep -q 'CPUAffinity=numa'
    pid1StopUnit $runUnit

fi

# Cleanup
rm -rf $testDir
rm -rf $confDir
systemctl daemon-reload

systemd-analyze log-level info

echo OK > /testok

exit 0