1 files changed, 163 insertions, 0 deletions
diff --git a/linux-kernel-patches/12-bb8c13d61a629276a162c1d2b1a20a815cbcfbb7.patch b/linux-kernel-patches/12-bb8c13d61a629276a162c1d2b1a20a815cbcfbb7.patch
new file mode 100644
index 0000000..7baa44a
--- /dev/null
+++ b/linux-kernel-patches/12-bb8c13d61a629276a162c1d2b1a20a815cbcfbb7.patch
@@ -0,0 +1,163 @@
+From bb8c13d61a629276a162c1d2b1a20a815cbcfbb7 Mon Sep 17 00:00:00 2001
+From: Borislav Petkov <bp@suse.de>
+Date: Wed, 14 Mar 2018 19:36:15 +0100
+Subject: x86/microcode: Fix CPU synchronization routine
+
+Emanuel reported an issue with a hang during microcode update because my
+dumb idea to use one atomic synchronization variable for both rendezvous
+- before and after update - was simply bollocks:
+
+  microcode: microcode_reload_late: late_cpus: 4
+  microcode: __reload_late: cpu 2 entered
+  microcode: __reload_late: cpu 1 entered
+  microcode: __reload_late: cpu 3 entered
+  microcode: __reload_late: cpu 0 entered
+  microcode: __reload_late: cpu 1 left
+  microcode: Timeout while waiting for CPUs rendezvous, remaining: 1
+
+CPU1 above would finish, leave and the others will still spin waiting for
+it to join.
+
+So do two synchronization atomics instead, which makes the code a lot more
+straightforward.
+
+Also, since the update is serialized and it also takes quite some time per
+microcode engine, increase the exit timeout by the number of CPUs on the
+system.
+
+That's ok because the moment all CPUs are done, that timeout will be cut
+short.
+
+Furthermore, panic when some of the CPUs timeout when returning from a
+microcode update: we can't allow a system with not all cores updated.
+
+Also, as an optimization, do not do the exit sync if microcode wasn't
+updated.
+
+Reported-by: Emanuel Czirai <xftroxgpx@protonmail.com>
+Signed-off-by: Borislav Petkov <bp@suse.de>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Tested-by: Emanuel Czirai <xftroxgpx@protonmail.com>
+Tested-by: Ashok Raj <ashok.raj@intel.com>
+Tested-by: Tom Lendacky <thomas.lendacky@amd.com>
+Link: https://lkml.kernel.org/r/20180314183615.17629-2-bp@alien8.de
+---
+ arch/x86/kernel/cpu/microcode/core.c | 68 ++++++++++++++++++++++--------------
+ 1 file changed, 41 insertions(+), 27 deletions(-)
+
+diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
+index 9f0fe5b..10c4fc2 100644
+--- a/arch/x86/kernel/cpu/microcode/core.c
++++ b/arch/x86/kernel/cpu/microcode/core.c
+@@ -517,7 +517,29 @@ static int check_online_cpus(void)
+ 	return -EINVAL;
+ }
+ 
+-static atomic_t late_cpus;
++static atomic_t late_cpus_in;
++static atomic_t late_cpus_out;
++
++static int __wait_for_cpus(atomic_t *t, long long timeout)
++{
++	int all_cpus = num_online_cpus();
++
++	atomic_inc(t);
++
++	while (atomic_read(t) < all_cpus) {
++		if (timeout < SPINUNIT) {
++			pr_err("Timeout while waiting for CPUs rendezvous, remaining: %d\n",
++				all_cpus - atomic_read(t));
++			return 1;
++		}
++
++		ndelay(SPINUNIT);
++		timeout -= SPINUNIT;
++
++		touch_nmi_watchdog();
++	}
++	return 0;
++}
+ 
+ /*
+  * Returns:
+@@ -527,30 +549,16 @@ static atomic_t late_cpus;
+  */
+ static int __reload_late(void *info)
+ {
+-	unsigned int timeout = NSEC_PER_SEC;
+-	int all_cpus = num_online_cpus();
+ 	int cpu = smp_processor_id();
+ 	enum ucode_state err;
+ 	int ret = 0;
+ 
+-	atomic_dec(&late_cpus);
+-
+ 	/*
+ 	 * Wait for all CPUs to arrive. A load will not be attempted unless all
+ 	 * CPUs show up.
+ 	 * */
+-	while (atomic_read(&late_cpus)) {
+-		if (timeout < SPINUNIT) {
+-			pr_err("Timeout while waiting for CPUs rendezvous, remaining: %d\n",
+-				atomic_read(&late_cpus));
+-			return -1;
+-		}
+-
+-		ndelay(SPINUNIT);
+-		timeout -= SPINUNIT;
+-
+-		touch_nmi_watchdog();
+-	}
++	if (__wait_for_cpus(&late_cpus_in, NSEC_PER_SEC))
++		return -1;
+ 
+ 	spin_lock(&update_lock);
+ 	apply_microcode_local(&err);
+@@ -558,15 +566,22 @@ static int __reload_late(void *info)
+ 
+ 	if (err > UCODE_NFOUND) {
+ 		pr_warn("Error reloading microcode on CPU %d\n", cpu);
+-		ret = -1;
+-	} else if (err == UCODE_UPDATED) {
++		return -1;
++	/* siblings return UCODE_OK because their engine got updated already */
++	} else if (err == UCODE_UPDATED || err == UCODE_OK) {
+ 		ret = 1;
++	} else {
++		return ret;
+ 	}
+ 
+-	atomic_inc(&late_cpus);
+-
+-	while (atomic_read(&late_cpus) != all_cpus)
+-		cpu_relax();
++	/*
++	 * Increase the wait timeout to a safe value here since we're
++	 * serializing the microcode update and that could take a while on a
++	 * large number of CPUs. And that is fine as the *actual* timeout will
++	 * be determined by the last CPU finished updating and thus cut short.
++	 */
++	if (__wait_for_cpus(&late_cpus_out, NSEC_PER_SEC * num_online_cpus()))
++		panic("Timeout during microcode update!\n");
+ 
+ 	return ret;
+ }
+@@ -579,12 +594,11 @@ static int microcode_reload_late(void)
+ {
+ 	int ret;
+ 
+-	atomic_set(&late_cpus, num_online_cpus());
++	atomic_set(&late_cpus_in,  0);
++	atomic_set(&late_cpus_out, 0);
+ 
+ 	ret = stop_machine_cpuslocked(__reload_late, NULL, cpu_online_mask);
+-	if (ret < 0)
+-		return ret;
+-	else if (ret > 0)
++	if (ret > 0)
+ 		microcode_check();
+ 
+ 	return ret;
+-- 
+cgit v1.1
+