summaryrefslogtreecommitdiffstats
path: root/arch/x86/entry/vdso
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/entry/vdso')
-rw-r--r--arch/x86/entry/vdso/.gitignore8
-rw-r--r--arch/x86/entry/vdso/Makefile220
-rwxr-xr-xarch/x86/entry/vdso/checkundef.sh10
-rw-r--r--arch/x86/entry/vdso/extable.c46
-rw-r--r--arch/x86/entry/vdso/extable.h28
-rw-r--r--arch/x86/entry/vdso/vclock_gettime.c85
-rw-r--r--arch/x86/entry/vdso/vdso-layout.lds.S111
-rw-r--r--arch/x86/entry/vdso/vdso-note.S15
-rw-r--r--arch/x86/entry/vdso/vdso.lds.S35
-rw-r--r--arch/x86/entry/vdso/vdso2c.c254
-rw-r--r--arch/x86/entry/vdso/vdso2c.h228
-rw-r--r--arch/x86/entry/vdso/vdso32-setup.c81
-rw-r--r--arch/x86/entry/vdso/vdso32/.gitignore2
-rw-r--r--arch/x86/entry/vdso/vdso32/fake_32bit_build.h25
-rw-r--r--arch/x86/entry/vdso/vdso32/note.S18
-rw-r--r--arch/x86/entry/vdso/vdso32/sigreturn.S140
-rw-r--r--arch/x86/entry/vdso/vdso32/system_call.S85
-rw-r--r--arch/x86/entry/vdso/vdso32/vclock_gettime.c4
-rw-r--r--arch/x86/entry/vdso/vdso32/vdso32.lds.S41
-rw-r--r--arch/x86/entry/vdso/vdso32/vgetcpu.c3
-rw-r--r--arch/x86/entry/vdso/vdsox32.lds.S27
-rw-r--r--arch/x86/entry/vdso/vgetcpu.c22
-rw-r--r--arch/x86/entry/vdso/vma.c422
-rw-r--r--arch/x86/entry/vdso/vsgx.S151
24 files changed, 2061 insertions, 0 deletions
diff --git a/arch/x86/entry/vdso/.gitignore b/arch/x86/entry/vdso/.gitignore
new file mode 100644
index 0000000000..37a6129d59
--- /dev/null
+++ b/arch/x86/entry/vdso/.gitignore
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0-only
+vdso.lds
+vdsox32.lds
+vdso32-syscall-syms.lds
+vdso32-sysenter-syms.lds
+vdso32-int80-syms.lds
+vdso-image-*.c
+vdso2c
diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
new file mode 100644
index 0000000000..6a1821bd7d
--- /dev/null
+++ b/arch/x86/entry/vdso/Makefile
@@ -0,0 +1,220 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Building vDSO images for x86.
+#
+
+# Include the generic Makefile to check the built vdso.
+include $(srctree)/lib/vdso/Makefile
+
+# Sanitizer runtimes are unavailable and cannot be linked here.
+KASAN_SANITIZE := n
+KMSAN_SANITIZE_vclock_gettime.o := n
+KMSAN_SANITIZE_vgetcpu.o := n
+
+UBSAN_SANITIZE := n
+KCSAN_SANITIZE := n
+OBJECT_FILES_NON_STANDARD := y
+
+# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in.
+KCOV_INSTRUMENT := n
+
+VDSO64-$(CONFIG_X86_64) := y
+VDSOX32-$(CONFIG_X86_X32_ABI) := y
+VDSO32-$(CONFIG_X86_32) := y
+VDSO32-$(CONFIG_IA32_EMULATION) := y
+
+# files to link into the vdso
+vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o
+vobjs32-y := vdso32/note.o vdso32/system_call.o vdso32/sigreturn.o
+vobjs32-y += vdso32/vclock_gettime.o vdso32/vgetcpu.o
+vobjs-$(CONFIG_X86_SGX) += vsgx.o
+
+# files to link into kernel
+obj-y += vma.o extable.o
+KASAN_SANITIZE_vma.o := y
+UBSAN_SANITIZE_vma.o := y
+KCSAN_SANITIZE_vma.o := y
+OBJECT_FILES_NON_STANDARD_vma.o := n
+OBJECT_FILES_NON_STANDARD_extable.o := n
+
+# vDSO images to build
+vdso_img-$(VDSO64-y) += 64
+vdso_img-$(VDSOX32-y) += x32
+vdso_img-$(VDSO32-y) += 32
+
+obj-$(VDSO32-y) += vdso32-setup.o
+
+vobjs := $(foreach F,$(vobjs-y),$(obj)/$F)
+vobjs32 := $(foreach F,$(vobjs32-y),$(obj)/$F)
+
+$(obj)/vdso.o: $(obj)/vdso.so
+
+targets += vdso.lds $(vobjs-y)
+targets += vdso32/vdso32.lds $(vobjs32-y)
+
+# Build the vDSO image C files and link them in.
+vdso_img_objs := $(vdso_img-y:%=vdso-image-%.o)
+vdso_img_cfiles := $(vdso_img-y:%=vdso-image-%.c)
+vdso_img_sodbg := $(vdso_img-y:%=vdso%.so.dbg)
+obj-y += $(vdso_img_objs)
+targets += $(vdso_img_cfiles)
+targets += $(vdso_img_sodbg) $(vdso_img-y:%=vdso%.so)
+
+CPPFLAGS_vdso.lds += -P -C
+
+VDSO_LDFLAGS_vdso.lds = -m elf_x86_64 -soname linux-vdso.so.1 --no-undefined \
+ -z max-page-size=4096
+
+$(obj)/vdso64.so.dbg: $(obj)/vdso.lds $(vobjs) FORCE
+ $(call if_changed,vdso_and_check)
+
+HOST_EXTRACFLAGS += -I$(srctree)/tools/include -I$(srctree)/include/uapi -I$(srctree)/arch/$(SUBARCH)/include/uapi
+hostprogs += vdso2c
+
+quiet_cmd_vdso2c = VDSO2C $@
+ cmd_vdso2c = $(obj)/vdso2c $< $(<:%.dbg=%) $@
+
+$(obj)/vdso-image-%.c: $(obj)/vdso%.so.dbg $(obj)/vdso%.so $(obj)/vdso2c FORCE
+ $(call if_changed,vdso2c)
+
+#
+# Don't omit frame pointers for ease of userspace debugging, but do
+# optimize sibling calls.
+#
+CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \
+ $(filter -g%,$(KBUILD_CFLAGS)) -fno-stack-protector \
+ -fno-omit-frame-pointer -foptimize-sibling-calls \
+ -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO
+
+ifdef CONFIG_RETPOLINE
+ifneq ($(RETPOLINE_VDSO_CFLAGS),)
+ CFL += $(RETPOLINE_VDSO_CFLAGS)
+endif
+endif
+
+$(vobjs): KBUILD_CFLAGS := $(filter-out $(PADDING_CFLAGS) $(CC_FLAGS_LTO) $(CC_FLAGS_CFI) $(RANDSTRUCT_CFLAGS) $(GCC_PLUGINS_CFLAGS) $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS)) $(CFL)
+$(vobjs): KBUILD_AFLAGS += -DBUILD_VDSO
+
+#
+# vDSO code runs in userspace and -pg doesn't help with profiling anyway.
+#
+CFLAGS_REMOVE_vclock_gettime.o = -pg
+CFLAGS_REMOVE_vdso32/vclock_gettime.o = -pg
+CFLAGS_REMOVE_vgetcpu.o = -pg
+CFLAGS_REMOVE_vdso32/vgetcpu.o = -pg
+CFLAGS_REMOVE_vsgx.o = -pg
+
+#
+# X32 processes use x32 vDSO to access 64bit kernel data.
+#
+# Build x32 vDSO image:
+# 1. Compile x32 vDSO as 64bit.
+# 2. Convert object files to x32.
+# 3. Build x32 VDSO image with x32 objects, which contains 64bit codes
+# so that it can reach 64bit address space with 64bit pointers.
+#
+
+CPPFLAGS_vdsox32.lds = $(CPPFLAGS_vdso.lds)
+VDSO_LDFLAGS_vdsox32.lds = -m elf32_x86_64 -soname linux-vdso.so.1 \
+ -z max-page-size=4096
+
+# x32-rebranded versions
+vobjx32s-y := $(vobjs-y:.o=-x32.o)
+
+# same thing, but in the output directory
+vobjx32s := $(foreach F,$(vobjx32s-y),$(obj)/$F)
+
+# Convert 64bit object file to x32 for x32 vDSO.
+quiet_cmd_x32 = X32 $@
+ cmd_x32 = $(OBJCOPY) -O elf32-x86-64 $< $@
+
+$(obj)/%-x32.o: $(obj)/%.o FORCE
+ $(call if_changed,x32)
+
+targets += vdsox32.lds $(vobjx32s-y)
+
+$(obj)/%.so: OBJCOPYFLAGS := -S --remove-section __ex_table
+$(obj)/%.so: $(obj)/%.so.dbg FORCE
+ $(call if_changed,objcopy)
+
+$(obj)/vdsox32.so.dbg: $(obj)/vdsox32.lds $(vobjx32s) FORCE
+ $(call if_changed,vdso_and_check)
+
+CPPFLAGS_vdso32/vdso32.lds = $(CPPFLAGS_vdso.lds)
+VDSO_LDFLAGS_vdso32.lds = -m elf_i386 -soname linux-gate.so.1
+
+KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS)) -DBUILD_VDSO
+$(obj)/vdso32.so.dbg: KBUILD_AFLAGS = $(KBUILD_AFLAGS_32)
+$(obj)/vdso32.so.dbg: asflags-$(CONFIG_X86_64) += -m32
+
+KBUILD_CFLAGS_32 := $(filter-out -m64,$(KBUILD_CFLAGS))
+KBUILD_CFLAGS_32 := $(filter-out -mcmodel=kernel,$(KBUILD_CFLAGS_32))
+KBUILD_CFLAGS_32 := $(filter-out -fno-pic,$(KBUILD_CFLAGS_32))
+KBUILD_CFLAGS_32 := $(filter-out -mfentry,$(KBUILD_CFLAGS_32))
+KBUILD_CFLAGS_32 := $(filter-out $(RANDSTRUCT_CFLAGS),$(KBUILD_CFLAGS_32))
+KBUILD_CFLAGS_32 := $(filter-out $(GCC_PLUGINS_CFLAGS),$(KBUILD_CFLAGS_32))
+KBUILD_CFLAGS_32 := $(filter-out $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS_32))
+KBUILD_CFLAGS_32 := $(filter-out $(CC_FLAGS_LTO),$(KBUILD_CFLAGS_32))
+KBUILD_CFLAGS_32 := $(filter-out $(CC_FLAGS_CFI),$(KBUILD_CFLAGS_32))
+KBUILD_CFLAGS_32 := $(filter-out $(PADDING_CFLAGS),$(KBUILD_CFLAGS_32))
+KBUILD_CFLAGS_32 += -m32 -msoft-float -mregparm=0 -fpic
+KBUILD_CFLAGS_32 += -fno-stack-protector
+KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls)
+KBUILD_CFLAGS_32 += -fno-omit-frame-pointer
+KBUILD_CFLAGS_32 += -DDISABLE_BRANCH_PROFILING
+
+ifdef CONFIG_RETPOLINE
+ifneq ($(RETPOLINE_VDSO_CFLAGS),)
+ KBUILD_CFLAGS_32 += $(RETPOLINE_VDSO_CFLAGS)
+endif
+endif
+
+$(obj)/vdso32.so.dbg: KBUILD_CFLAGS = $(KBUILD_CFLAGS_32)
+
+$(obj)/vdso32.so.dbg: $(obj)/vdso32/vdso32.lds $(vobjs32) FORCE
+ $(call if_changed,vdso_and_check)
+
+#
+# The DSO images are built using a special linker script.
+#
+quiet_cmd_vdso = VDSO $@
+ cmd_vdso = $(LD) -o $@ \
+ $(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \
+ -T $(filter %.lds,$^) $(filter %.o,$^) && \
+ sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@'
+
+VDSO_LDFLAGS = -shared --hash-style=both --build-id=sha1 \
+ $(call ld-option, --eh-frame-hdr) -Bsymbolic -z noexecstack
+GCOV_PROFILE := n
+
+quiet_cmd_vdso_and_check = VDSO $@
+ cmd_vdso_and_check = $(cmd_vdso); $(cmd_vdso_check)
+
+#
+# Install the unstripped copies of vdso*.so. If our toolchain supports
+# build-id, install .build-id links as well.
+#
+quiet_cmd_vdso_install = INSTALL $(@:install_%=%)
+define cmd_vdso_install
+ cp $< "$(MODLIB)/vdso/$(@:install_%=%)"; \
+ if readelf -n $< |grep -q 'Build ID'; then \
+ buildid=`readelf -n $< |grep 'Build ID' |sed -e 's/^.*Build ID: \(.*\)$$/\1/'`; \
+ first=`echo $$buildid | cut -b-2`; \
+ last=`echo $$buildid | cut -b3-`; \
+ mkdir -p "$(MODLIB)/vdso/.build-id/$$first"; \
+ ln -sf "../../$(@:install_%=%)" "$(MODLIB)/vdso/.build-id/$$first/$$last.debug"; \
+ fi
+endef
+
+vdso_img_insttargets := $(vdso_img_sodbg:%.dbg=install_%)
+
+$(MODLIB)/vdso: FORCE
+ @mkdir -p $(MODLIB)/vdso
+
+$(vdso_img_insttargets): install_%: $(obj)/%.dbg $(MODLIB)/vdso
+ $(call cmd,vdso_install)
+
+PHONY += vdso_install $(vdso_img_insttargets)
+vdso_install: $(vdso_img_insttargets)
+
+clean-files := vdso32.so vdso32.so.dbg vdso64* vdso-image-*.c vdsox32.so*
diff --git a/arch/x86/entry/vdso/checkundef.sh b/arch/x86/entry/vdso/checkundef.sh
new file mode 100755
index 0000000000..7ee90a9b54
--- /dev/null
+++ b/arch/x86/entry/vdso/checkundef.sh
@@ -0,0 +1,10 @@
+#!/bin/sh
+nm="$1"
+file="$2"
+$nm "$file" | grep '^ *U' > /dev/null 2>&1
+if [ $? -eq 1 ]; then
+ exit 0
+else
+ echo "$file: undefined symbols found" >&2
+ exit 1
+fi
diff --git a/arch/x86/entry/vdso/extable.c b/arch/x86/entry/vdso/extable.c
new file mode 100644
index 0000000000..afcf5b65be
--- /dev/null
+++ b/arch/x86/entry/vdso/extable.c
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/err.h>
+#include <linux/mm.h>
+#include <asm/current.h>
+#include <asm/traps.h>
+#include <asm/vdso.h>
+
+struct vdso_exception_table_entry {
+ int insn, fixup;
+};
+
+bool fixup_vdso_exception(struct pt_regs *regs, int trapnr,
+ unsigned long error_code, unsigned long fault_addr)
+{
+ const struct vdso_image *image = current->mm->context.vdso_image;
+ const struct vdso_exception_table_entry *extable;
+ unsigned int nr_entries, i;
+ unsigned long base;
+
+ /*
+ * Do not attempt to fixup #DB or #BP. It's impossible to identify
+ * whether or not a #DB/#BP originated from within an SGX enclave and
+ * SGX enclaves are currently the only use case for vDSO fixup.
+ */
+ if (trapnr == X86_TRAP_DB || trapnr == X86_TRAP_BP)
+ return false;
+
+ if (!current->mm->context.vdso)
+ return false;
+
+ base = (unsigned long)current->mm->context.vdso + image->extable_base;
+ nr_entries = image->extable_len / (sizeof(*extable));
+ extable = image->extable;
+
+ for (i = 0; i < nr_entries; i++) {
+ if (regs->ip == base + extable[i].insn) {
+ regs->ip = base + extable[i].fixup;
+ regs->di = trapnr;
+ regs->si = error_code;
+ regs->dx = fault_addr;
+ return true;
+ }
+ }
+
+ return false;
+}
diff --git a/arch/x86/entry/vdso/extable.h b/arch/x86/entry/vdso/extable.h
new file mode 100644
index 0000000000..b56f6b0129
--- /dev/null
+++ b/arch/x86/entry/vdso/extable.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __VDSO_EXTABLE_H
+#define __VDSO_EXTABLE_H
+
+/*
+ * Inject exception fixup for vDSO code. Unlike normal exception fixup,
+ * vDSO uses a dedicated handler the addresses are relative to the overall
+ * exception table, not each individual entry.
+ */
+#ifdef __ASSEMBLY__
+#define _ASM_VDSO_EXTABLE_HANDLE(from, to) \
+ ASM_VDSO_EXTABLE_HANDLE from to
+
+.macro ASM_VDSO_EXTABLE_HANDLE from:req to:req
+ .pushsection __ex_table, "a"
+ .long (\from) - __ex_table
+ .long (\to) - __ex_table
+ .popsection
+.endm
+#else
+#define _ASM_VDSO_EXTABLE_HANDLE(from, to) \
+ ".pushsection __ex_table, \"a\"\n" \
+ ".long (" #from ") - __ex_table\n" \
+ ".long (" #to ") - __ex_table\n" \
+ ".popsection\n"
+#endif
+
+#endif /* __VDSO_EXTABLE_H */
diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
new file mode 100644
index 0000000000..7d70935b67
--- /dev/null
+++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Fast user context implementation of clock_gettime, gettimeofday, and time.
+ *
+ * Copyright 2006 Andi Kleen, SUSE Labs.
+ * Copyright 2019 ARM Limited
+ *
+ * 32 Bit compat layer by Stefani Seibold <stefani@seibold.net>
+ * sponsored by Rohde & Schwarz GmbH & Co. KG Munich/Germany
+ */
+#include <linux/time.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+
+#include "../../../../lib/vdso/gettimeofday.c"
+
+extern int __vdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz);
+extern __kernel_old_time_t __vdso_time(__kernel_old_time_t *t);
+
+int __vdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz)
+{
+ return __cvdso_gettimeofday(tv, tz);
+}
+
+int gettimeofday(struct __kernel_old_timeval *, struct timezone *)
+ __attribute__((weak, alias("__vdso_gettimeofday")));
+
+__kernel_old_time_t __vdso_time(__kernel_old_time_t *t)
+{
+ return __cvdso_time(t);
+}
+
+__kernel_old_time_t time(__kernel_old_time_t *t) __attribute__((weak, alias("__vdso_time")));
+
+
+#if defined(CONFIG_X86_64) && !defined(BUILD_VDSO32_64)
+/* both 64-bit and x32 use these */
+extern int __vdso_clock_gettime(clockid_t clock, struct __kernel_timespec *ts);
+extern int __vdso_clock_getres(clockid_t clock, struct __kernel_timespec *res);
+
+int __vdso_clock_gettime(clockid_t clock, struct __kernel_timespec *ts)
+{
+ return __cvdso_clock_gettime(clock, ts);
+}
+
+int clock_gettime(clockid_t, struct __kernel_timespec *)
+ __attribute__((weak, alias("__vdso_clock_gettime")));
+
+int __vdso_clock_getres(clockid_t clock,
+ struct __kernel_timespec *res)
+{
+ return __cvdso_clock_getres(clock, res);
+}
+int clock_getres(clockid_t, struct __kernel_timespec *)
+ __attribute__((weak, alias("__vdso_clock_getres")));
+
+#else
+/* i386 only */
+extern int __vdso_clock_gettime(clockid_t clock, struct old_timespec32 *ts);
+extern int __vdso_clock_getres(clockid_t clock, struct old_timespec32 *res);
+
+int __vdso_clock_gettime(clockid_t clock, struct old_timespec32 *ts)
+{
+ return __cvdso_clock_gettime32(clock, ts);
+}
+
+int clock_gettime(clockid_t, struct old_timespec32 *)
+ __attribute__((weak, alias("__vdso_clock_gettime")));
+
+int __vdso_clock_gettime64(clockid_t clock, struct __kernel_timespec *ts)
+{
+ return __cvdso_clock_gettime(clock, ts);
+}
+
+int clock_gettime64(clockid_t, struct __kernel_timespec *)
+ __attribute__((weak, alias("__vdso_clock_gettime64")));
+
+int __vdso_clock_getres(clockid_t clock, struct old_timespec32 *res)
+{
+ return __cvdso_clock_getres_time32(clock, res);
+}
+
+int clock_getres(clockid_t, struct old_timespec32 *)
+ __attribute__((weak, alias("__vdso_clock_getres")));
+#endif
diff --git a/arch/x86/entry/vdso/vdso-layout.lds.S b/arch/x86/entry/vdso/vdso-layout.lds.S
new file mode 100644
index 0000000000..bafa73f09e
--- /dev/null
+++ b/arch/x86/entry/vdso/vdso-layout.lds.S
@@ -0,0 +1,111 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <asm/vdso.h>
+
+/*
+ * Linker script for vDSO. This is an ELF shared object prelinked to
+ * its virtual address, and with only one read-only segment.
+ * This script controls its layout.
+ */
+
+SECTIONS
+{
+ /*
+ * User/kernel shared data is before the vDSO. This may be a little
+ * uglier than putting it after the vDSO, but it avoids issues with
+ * non-allocatable things that dangle past the end of the PT_LOAD
+ * segment.
+ */
+
+ vvar_start = . - 4 * PAGE_SIZE;
+ vvar_page = vvar_start;
+
+ /* Place all vvars at the offsets in asm/vvar.h. */
+#define EMIT_VVAR(name, offset) vvar_ ## name = vvar_page + offset;
+#include <asm/vvar.h>
+#undef EMIT_VVAR
+
+ pvclock_page = vvar_start + PAGE_SIZE;
+ hvclock_page = vvar_start + 2 * PAGE_SIZE;
+ timens_page = vvar_start + 3 * PAGE_SIZE;
+
+#undef _ASM_X86_VVAR_H
+ /* Place all vvars in timens too at the offsets in asm/vvar.h. */
+#define EMIT_VVAR(name, offset) timens_ ## name = timens_page + offset;
+#include <asm/vvar.h>
+#undef EMIT_VVAR
+
+ . = SIZEOF_HEADERS;
+
+ .hash : { *(.hash) } :text
+ .gnu.hash : { *(.gnu.hash) }
+ .dynsym : { *(.dynsym) }
+ .dynstr : { *(.dynstr) }
+ .gnu.version : { *(.gnu.version) }
+ .gnu.version_d : { *(.gnu.version_d) }
+ .gnu.version_r : { *(.gnu.version_r) }
+
+ .dynamic : { *(.dynamic) } :text :dynamic
+
+ .rodata : {
+ *(.rodata*)
+ *(.data*)
+ *(.sdata*)
+ *(.got.plt) *(.got)
+ *(.gnu.linkonce.d.*)
+ *(.bss*)
+ *(.dynbss*)
+ *(.gnu.linkonce.b.*)
+ } :text
+
+ /*
+ * Discard .note.gnu.property sections which are unused and have
+ * different alignment requirement from vDSO note sections.
+ */
+ /DISCARD/ : {
+ *(.note.gnu.property)
+ }
+ .note : { *(.note.*) } :text :note
+
+ .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr
+ .eh_frame : { KEEP (*(.eh_frame)) } :text
+
+
+ /*
+ * Text is well-separated from actual data: there's plenty of
+ * stuff that isn't used at runtime in between.
+ */
+
+ .text : {
+ *(.text*)
+ } :text =0x90909090,
+
+
+
+ .altinstructions : { *(.altinstructions) } :text
+ .altinstr_replacement : { *(.altinstr_replacement) } :text
+
+ __ex_table : { *(__ex_table) } :text
+
+ /DISCARD/ : {
+ *(.discard)
+ *(.discard.*)
+ *(__bug_table)
+ }
+}
+
+/*
+ * Very old versions of ld do not recognize this name token; use the constant.
+ */
+#define PT_GNU_EH_FRAME 0x6474e550
+
+/*
+ * We must supply the ELF program headers explicitly to get just one
+ * PT_LOAD segment, and set the flags explicitly to make segments read-only.
+ */
+PHDRS
+{
+ text PT_LOAD FLAGS(5) FILEHDR PHDRS; /* PF_R|PF_X */
+ dynamic PT_DYNAMIC FLAGS(4); /* PF_R */
+ note PT_NOTE FLAGS(4); /* PF_R */
+ eh_frame_hdr PT_GNU_EH_FRAME;
+}
diff --git a/arch/x86/entry/vdso/vdso-note.S b/arch/x86/entry/vdso/vdso-note.S
new file mode 100644
index 0000000000..7942317011
--- /dev/null
+++ b/arch/x86/entry/vdso/vdso-note.S
@@ -0,0 +1,15 @@
+/*
+ * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text.
+ * Here we can supply some information useful to userland.
+ */
+
+#include <linux/build-salt.h>
+#include <linux/uts.h>
+#include <linux/version.h>
+#include <linux/elfnote.h>
+
+ELFNOTE_START(Linux, 0, "a")
+ .long LINUX_VERSION_CODE
+ELFNOTE_END
+
+BUILD_SALT
diff --git a/arch/x86/entry/vdso/vdso.lds.S b/arch/x86/entry/vdso/vdso.lds.S
new file mode 100644
index 0000000000..e8c60ae7a7
--- /dev/null
+++ b/arch/x86/entry/vdso/vdso.lds.S
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Linker script for 64-bit vDSO.
+ * We #include the file to define the layout details.
+ *
+ * This file defines the version script giving the user-exported symbols in
+ * the DSO.
+ */
+
+#define BUILD_VDSO64
+
+#include "vdso-layout.lds.S"
+
+/*
+ * This controls what userland symbols we export from the vDSO.
+ */
+VERSION {
+ LINUX_2.6 {
+ global:
+ clock_gettime;
+ __vdso_clock_gettime;
+ gettimeofday;
+ __vdso_gettimeofday;
+ getcpu;
+ __vdso_getcpu;
+ time;
+ __vdso_time;
+ clock_getres;
+ __vdso_clock_getres;
+#ifdef CONFIG_X86_SGX
+ __vdso_sgx_enter_enclave;
+#endif
+ local: *;
+ };
+}
diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c
new file mode 100644
index 0000000000..90d15f2a72
--- /dev/null
+++ b/arch/x86/entry/vdso/vdso2c.c
@@ -0,0 +1,254 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * vdso2c - A vdso image preparation tool
+ * Copyright (c) 2014 Andy Lutomirski and others
+ *
+ * vdso2c requires stripped and unstripped input. It would be trivial
+ * to fully strip the input in here, but, for reasons described below,
+ * we need to write a section table. Doing this is more or less
+ * equivalent to dropping all non-allocatable sections, but it's
+ * easier to let objcopy handle that instead of doing it ourselves.
+ * If we ever need to do something fancier than what objcopy provides,
+ * it would be straightforward to add here.
+ *
+ * We're keep a section table for a few reasons:
+ *
+ * The Go runtime had a couple of bugs: it would read the section
+ * table to try to figure out how many dynamic symbols there were (it
+ * shouldn't have looked at the section table at all) and, if there
+ * were no SHT_SYNDYM section table entry, it would use an
+ * uninitialized value for the number of symbols. An empty DYNSYM
+ * table would work, but I see no reason not to write a valid one (and
+ * keep full performance for old Go programs). This hack is only
+ * needed on x86_64.
+ *
+ * The bug was introduced on 2012-08-31 by:
+ * https://code.google.com/p/go/source/detail?r=56ea40aac72b
+ * and was fixed on 2014-06-13 by:
+ * https://code.google.com/p/go/source/detail?r=fc1cd5e12595
+ *
+ * Binutils has issues debugging the vDSO: it reads the section table to
+ * find SHT_NOTE; it won't look at PT_NOTE for the in-memory vDSO, which
+ * would break build-id if we removed the section table. Binutils
+ * also requires that shstrndx != 0. See:
+ * https://sourceware.org/bugzilla/show_bug.cgi?id=17064
+ *
+ * elfutils might not look for PT_NOTE if there is a section table at
+ * all. I don't know whether this matters for any practical purpose.
+ *
+ * For simplicity, rather than hacking up a partial section table, we
+ * just write a mostly complete one. We omit non-dynamic symbols,
+ * though, since they're rather large.
+ *
+ * Once binutils gets fixed, we might be able to drop this for all but
+ * the 64-bit vdso, since build-id only works in kernel RPMs, and
+ * systems that update to new enough kernel RPMs will likely update
+ * binutils in sync. build-id has never worked for home-built kernel
+ * RPMs without manual symlinking, and I suspect that no one ever does
+ * that.
+ */
+
+#include <inttypes.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <fcntl.h>
+#include <err.h>
+
+#include <sys/mman.h>
+#include <sys/types.h>
+
+#include <tools/le_byteshift.h>
+
+#include <linux/elf.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+
+const char *outfilename;
+
+/* Symbols that we need in vdso2c. */
+enum {
+ sym_vvar_start,
+ sym_vvar_page,
+ sym_pvclock_page,
+ sym_hvclock_page,
+ sym_timens_page,
+};
+
+const int special_pages[] = {
+ sym_vvar_page,
+ sym_pvclock_page,
+ sym_hvclock_page,
+ sym_timens_page,
+};
+
+struct vdso_sym {
+ const char *name;
+ bool export;
+};
+
+struct vdso_sym required_syms[] = {
+ [sym_vvar_start] = {"vvar_start", true},
+ [sym_vvar_page] = {"vvar_page", true},
+ [sym_pvclock_page] = {"pvclock_page", true},
+ [sym_hvclock_page] = {"hvclock_page", true},
+ [sym_timens_page] = {"timens_page", true},
+ {"VDSO32_NOTE_MASK", true},
+ {"__kernel_vsyscall", true},
+ {"__kernel_sigreturn", true},
+ {"__kernel_rt_sigreturn", true},
+ {"int80_landing_pad", true},
+ {"vdso32_rt_sigreturn_landing_pad", true},
+ {"vdso32_sigreturn_landing_pad", true},
+};
+
+__attribute__((format(printf, 1, 2))) __attribute__((noreturn))
+static void fail(const char *format, ...)
+{
+ va_list ap;
+ va_start(ap, format);
+ fprintf(stderr, "Error: ");
+ vfprintf(stderr, format, ap);
+ if (outfilename)
+ unlink(outfilename);
+ exit(1);
+ va_end(ap);
+}
+
+/*
+ * Evil macros for little-endian reads and writes
+ */
+#define GLE(x, bits, ifnot) \
+ __builtin_choose_expr( \
+ (sizeof(*(x)) == bits/8), \
+ (__typeof__(*(x)))get_unaligned_le##bits(x), ifnot)
+
+extern void bad_get_le(void);
+#define LAST_GLE(x) \
+ __builtin_choose_expr(sizeof(*(x)) == 1, *(x), bad_get_le())
+
+#define GET_LE(x) \
+ GLE(x, 64, GLE(x, 32, GLE(x, 16, LAST_GLE(x))))
+
+#define PLE(x, val, bits, ifnot) \
+ __builtin_choose_expr( \
+ (sizeof(*(x)) == bits/8), \
+ put_unaligned_le##bits((val), (x)), ifnot)
+
+extern void bad_put_le(void);
+#define LAST_PLE(x, val) \
+ __builtin_choose_expr(sizeof(*(x)) == 1, *(x) = (val), bad_put_le())
+
+#define PUT_LE(x, val) \
+ PLE(x, val, 64, PLE(x, val, 32, PLE(x, val, 16, LAST_PLE(x, val))))
+
+
+#define NSYMS ARRAY_SIZE(required_syms)
+
+#define BITSFUNC3(name, bits, suffix) name##bits##suffix
+#define BITSFUNC2(name, bits, suffix) BITSFUNC3(name, bits, suffix)
+#define BITSFUNC(name) BITSFUNC2(name, ELF_BITS, )
+
+#define INT_BITS BITSFUNC2(int, ELF_BITS, _t)
+
+#define ELF_BITS_XFORM2(bits, x) Elf##bits##_##x
+#define ELF_BITS_XFORM(bits, x) ELF_BITS_XFORM2(bits, x)
+#define ELF(x) ELF_BITS_XFORM(ELF_BITS, x)
+
+#define ELF_BITS 64
+#include "vdso2c.h"
+#undef ELF_BITS
+
+#define ELF_BITS 32
+#include "vdso2c.h"
+#undef ELF_BITS
+
+static void go(void *raw_addr, size_t raw_len,
+ void *stripped_addr, size_t stripped_len,
+ FILE *outfile, const char *name)
+{
+ Elf64_Ehdr *hdr = (Elf64_Ehdr *)raw_addr;
+
+ if (hdr->e_ident[EI_CLASS] == ELFCLASS64) {
+ go64(raw_addr, raw_len, stripped_addr, stripped_len,
+ outfile, name);
+ } else if (hdr->e_ident[EI_CLASS] == ELFCLASS32) {
+ go32(raw_addr, raw_len, stripped_addr, stripped_len,
+ outfile, name);
+ } else {
+ fail("unknown ELF class\n");
+ }
+}
+
+static void map_input(const char *name, void **addr, size_t *len, int prot)
+{
+ off_t tmp_len;
+
+ int fd = open(name, O_RDONLY);
+ if (fd == -1)
+ err(1, "open(%s)", name);
+
+ tmp_len = lseek(fd, 0, SEEK_END);
+ if (tmp_len == (off_t)-1)
+ err(1, "lseek");
+ *len = (size_t)tmp_len;
+
+ *addr = mmap(NULL, tmp_len, prot, MAP_PRIVATE, fd, 0);
+ if (*addr == MAP_FAILED)
+ err(1, "mmap");
+
+ close(fd);
+}
+
+int main(int argc, char **argv)
+{
+ size_t raw_len, stripped_len;
+ void *raw_addr, *stripped_addr;
+ FILE *outfile;
+ char *name, *tmp;
+ int namelen;
+
+ if (argc != 4) {
+ printf("Usage: vdso2c RAW_INPUT STRIPPED_INPUT OUTPUT\n");
+ return 1;
+ }
+
+ /*
+ * Figure out the struct name. If we're writing to a .so file,
+ * generate raw output instead.
+ */
+ name = strdup(argv[3]);
+ namelen = strlen(name);
+ if (namelen >= 3 && !strcmp(name + namelen - 3, ".so")) {
+ name = NULL;
+ } else {
+ tmp = strrchr(name, '/');
+ if (tmp)
+ name = tmp + 1;
+ tmp = strchr(name, '.');
+ if (tmp)
+ *tmp = '\0';
+ for (tmp = name; *tmp; tmp++)
+ if (*tmp == '-')
+ *tmp = '_';
+ }
+
+ map_input(argv[1], &raw_addr, &raw_len, PROT_READ);
+ map_input(argv[2], &stripped_addr, &stripped_len, PROT_READ);
+
+ outfilename = argv[3];
+ outfile = fopen(outfilename, "w");
+ if (!outfile)
+ err(1, "fopen(%s)", outfilename);
+
+ go(raw_addr, raw_len, stripped_addr, stripped_len, outfile, name);
+
+ munmap(raw_addr, raw_len);
+ munmap(stripped_addr, stripped_len);
+ fclose(outfile);
+
+ return 0;
+}
diff --git a/arch/x86/entry/vdso/vdso2c.h b/arch/x86/entry/vdso/vdso2c.h
new file mode 100644
index 0000000000..67b3e37576
--- /dev/null
+++ b/arch/x86/entry/vdso/vdso2c.h
@@ -0,0 +1,228 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * This file is included twice from vdso2c.c. It generates code for 32-bit
+ * and 64-bit vDSOs. We need both for 64-bit builds, since 32-bit vDSOs
+ * are built for 32-bit userspace.
+ */
+
+static void BITSFUNC(copy)(FILE *outfile, const unsigned char *data, size_t len)
+{
+ size_t i;
+
+ for (i = 0; i < len; i++) {
+ if (i % 10 == 0)
+ fprintf(outfile, "\n\t");
+ fprintf(outfile, "0x%02X, ", (int)(data)[i]);
+ }
+}
+
+
+/*
+ * Extract a section from the input data into a standalone blob. Used to
+ * capture kernel-only data that needs to persist indefinitely, e.g. the
+ * exception fixup tables, but only in the kernel, i.e. the section can
+ * be stripped from the final vDSO image.
+ */
+static void BITSFUNC(extract)(const unsigned char *data, size_t data_len,
+ FILE *outfile, ELF(Shdr) *sec, const char *name)
+{
+ unsigned long offset;
+ size_t len;
+
+ offset = (unsigned long)GET_LE(&sec->sh_offset);
+ len = (size_t)GET_LE(&sec->sh_size);
+
+ if (offset + len > data_len)
+ fail("section to extract overruns input data");
+
+ fprintf(outfile, "static const unsigned char %s[%zu] = {", name, len);
+ BITSFUNC(copy)(outfile, data + offset, len);
+ fprintf(outfile, "\n};\n\n");
+}
+
+static void BITSFUNC(go)(void *raw_addr, size_t raw_len,
+ void *stripped_addr, size_t stripped_len,
+ FILE *outfile, const char *image_name)
+{
+ int found_load = 0;
+ unsigned long load_size = -1; /* Work around bogus warning */
+ unsigned long mapping_size;
+ ELF(Ehdr) *hdr = (ELF(Ehdr) *)raw_addr;
+ unsigned long i, syms_nr;
+ ELF(Shdr) *symtab_hdr = NULL, *strtab_hdr, *secstrings_hdr,
+ *alt_sec = NULL, *extable_sec = NULL;
+ ELF(Dyn) *dyn = 0, *dyn_end = 0;
+ const char *secstrings;
+ INT_BITS syms[NSYMS] = {};
+
+ ELF(Phdr) *pt = (ELF(Phdr) *)(raw_addr + GET_LE(&hdr->e_phoff));
+
+ if (GET_LE(&hdr->e_type) != ET_DYN)
+ fail("input is not a shared object\n");
+
+ /* Walk the segment table. */
+ for (i = 0; i < GET_LE(&hdr->e_phnum); i++) {
+ if (GET_LE(&pt[i].p_type) == PT_LOAD) {
+ if (found_load)
+ fail("multiple PT_LOAD segs\n");
+
+ if (GET_LE(&pt[i].p_offset) != 0 ||
+ GET_LE(&pt[i].p_vaddr) != 0)
+ fail("PT_LOAD in wrong place\n");
+
+ if (GET_LE(&pt[i].p_memsz) != GET_LE(&pt[i].p_filesz))
+ fail("cannot handle memsz != filesz\n");
+
+ load_size = GET_LE(&pt[i].p_memsz);
+ found_load = 1;
+ } else if (GET_LE(&pt[i].p_type) == PT_DYNAMIC) {
+ dyn = raw_addr + GET_LE(&pt[i].p_offset);
+ dyn_end = raw_addr + GET_LE(&pt[i].p_offset) +
+ GET_LE(&pt[i].p_memsz);
+ }
+ }
+ if (!found_load)
+ fail("no PT_LOAD seg\n");
+
+ if (stripped_len < load_size)
+ fail("stripped input is too short\n");
+
+ if (!dyn)
+ fail("input has no PT_DYNAMIC section -- your toolchain is buggy\n");
+
+ /* Walk the dynamic table */
+ for (i = 0; dyn + i < dyn_end &&
+ GET_LE(&dyn[i].d_tag) != DT_NULL; i++) {
+ typeof(dyn[i].d_tag) tag = GET_LE(&dyn[i].d_tag);
+ if (tag == DT_REL || tag == DT_RELSZ || tag == DT_RELA ||
+ tag == DT_RELENT || tag == DT_TEXTREL)
+ fail("vdso image contains dynamic relocations\n");
+ }
+
+ /* Walk the section table */
+ secstrings_hdr = raw_addr + GET_LE(&hdr->e_shoff) +
+ GET_LE(&hdr->e_shentsize)*GET_LE(&hdr->e_shstrndx);
+ secstrings = raw_addr + GET_LE(&secstrings_hdr->sh_offset);
+ for (i = 0; i < GET_LE(&hdr->e_shnum); i++) {
+ ELF(Shdr) *sh = raw_addr + GET_LE(&hdr->e_shoff) +
+ GET_LE(&hdr->e_shentsize) * i;
+ if (GET_LE(&sh->sh_type) == SHT_SYMTAB)
+ symtab_hdr = sh;
+
+ if (!strcmp(secstrings + GET_LE(&sh->sh_name),
+ ".altinstructions"))
+ alt_sec = sh;
+ if (!strcmp(secstrings + GET_LE(&sh->sh_name), "__ex_table"))
+ extable_sec = sh;
+ }
+
+ if (!symtab_hdr)
+ fail("no symbol table\n");
+
+ strtab_hdr = raw_addr + GET_LE(&hdr->e_shoff) +
+ GET_LE(&hdr->e_shentsize) * GET_LE(&symtab_hdr->sh_link);
+
+ syms_nr = GET_LE(&symtab_hdr->sh_size) / GET_LE(&symtab_hdr->sh_entsize);
+ /* Walk the symbol table */
+ for (i = 0; i < syms_nr; i++) {
+ unsigned int k;
+ ELF(Sym) *sym = raw_addr + GET_LE(&symtab_hdr->sh_offset) +
+ GET_LE(&symtab_hdr->sh_entsize) * i;
+ const char *sym_name = raw_addr +
+ GET_LE(&strtab_hdr->sh_offset) +
+ GET_LE(&sym->st_name);
+
+ for (k = 0; k < NSYMS; k++) {
+ if (!strcmp(sym_name, required_syms[k].name)) {
+ if (syms[k]) {
+ fail("duplicate symbol %s\n",
+ required_syms[k].name);
+ }
+
+ /*
+ * Careful: we use negative addresses, but
+ * st_value is unsigned, so we rely
+ * on syms[k] being a signed type of the
+ * correct width.
+ */
+ syms[k] = GET_LE(&sym->st_value);
+ }
+ }
+ }
+
+ /* Validate mapping addresses. */
+ for (i = 0; i < sizeof(special_pages) / sizeof(special_pages[0]); i++) {
+ INT_BITS symval = syms[special_pages[i]];
+
+ if (!symval)
+ continue; /* The mapping isn't used; ignore it. */
+
+ if (symval % 4096)
+ fail("%s must be a multiple of 4096\n",
+ required_syms[i].name);
+ if (symval + 4096 < syms[sym_vvar_start])
+ fail("%s underruns vvar_start\n",
+ required_syms[i].name);
+ if (symval + 4096 > 0)
+ fail("%s is on the wrong side of the vdso text\n",
+ required_syms[i].name);
+ }
+ if (syms[sym_vvar_start] % 4096)
+ fail("vvar_begin must be a multiple of 4096\n");
+
+ if (!image_name) {
+ fwrite(stripped_addr, stripped_len, 1, outfile);
+ return;
+ }
+
+ mapping_size = (stripped_len + 4095) / 4096 * 4096;
+
+ fprintf(outfile, "/* AUTOMATICALLY GENERATED -- DO NOT EDIT */\n\n");
+ fprintf(outfile, "#include <linux/linkage.h>\n");
+ fprintf(outfile, "#include <linux/init.h>\n");
+ fprintf(outfile, "#include <asm/page_types.h>\n");
+ fprintf(outfile, "#include <asm/vdso.h>\n");
+ fprintf(outfile, "\n");
+ fprintf(outfile,
+ "static unsigned char raw_data[%lu] __ro_after_init __aligned(PAGE_SIZE) = {",
+ mapping_size);
+ for (i = 0; i < stripped_len; i++) {
+ if (i % 10 == 0)
+ fprintf(outfile, "\n\t");
+ fprintf(outfile, "0x%02X, ",
+ (int)((unsigned char *)stripped_addr)[i]);
+ }
+ fprintf(outfile, "\n};\n\n");
+ if (extable_sec)
+ BITSFUNC(extract)(raw_addr, raw_len, outfile,
+ extable_sec, "extable");
+
+ fprintf(outfile, "const struct vdso_image %s = {\n", image_name);
+ fprintf(outfile, "\t.data = raw_data,\n");
+ fprintf(outfile, "\t.size = %lu,\n", mapping_size);
+ if (alt_sec) {
+ fprintf(outfile, "\t.alt = %lu,\n",
+ (unsigned long)GET_LE(&alt_sec->sh_offset));
+ fprintf(outfile, "\t.alt_len = %lu,\n",
+ (unsigned long)GET_LE(&alt_sec->sh_size));
+ }
+ if (extable_sec) {
+ fprintf(outfile, "\t.extable_base = %lu,\n",
+ (unsigned long)GET_LE(&extable_sec->sh_offset));
+ fprintf(outfile, "\t.extable_len = %lu,\n",
+ (unsigned long)GET_LE(&extable_sec->sh_size));
+ fprintf(outfile, "\t.extable = extable,\n");
+ }
+
+ for (i = 0; i < NSYMS; i++) {
+ if (required_syms[i].export && syms[i])
+ fprintf(outfile, "\t.sym_%s = %" PRIi64 ",\n",
+ required_syms[i].name, (int64_t)syms[i]);
+ }
+ fprintf(outfile, "};\n\n");
+ fprintf(outfile, "static __init int init_%s(void) {\n", image_name);
+ fprintf(outfile, "\treturn init_vdso_image(&%s);\n", image_name);
+ fprintf(outfile, "};\n");
+ fprintf(outfile, "subsys_initcall(init_%s);\n", image_name);
+
+}
diff --git a/arch/x86/entry/vdso/vdso32-setup.c b/arch/x86/entry/vdso/vdso32-setup.c
new file mode 100644
index 0000000000..f3b3cacbcb
--- /dev/null
+++ b/arch/x86/entry/vdso/vdso32-setup.c
@@ -0,0 +1,81 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * (C) Copyright 2002 Linus Torvalds
+ * Portions based on the vdso-randomization code from exec-shield:
+ * Copyright(C) 2005-2006, Red Hat, Inc., Ingo Molnar
+ *
+ * This file contains the needed initializations to support sysenter.
+ */
+
+#include <linux/init.h>
+#include <linux/smp.h>
+#include <linux/kernel.h>
+#include <linux/mm_types.h>
+#include <linux/elf.h>
+
+#include <asm/processor.h>
+#include <asm/vdso.h>
+
+#ifdef CONFIG_COMPAT_VDSO
+#define VDSO_DEFAULT 0
+#else
+#define VDSO_DEFAULT 1
+#endif
+
+/*
+ * Should the kernel map a VDSO page into processes and pass its
+ * address down to glibc upon exec()?
+ */
+unsigned int __read_mostly vdso32_enabled = VDSO_DEFAULT;
+
+static int __init vdso32_setup(char *s)
+{
+ vdso32_enabled = simple_strtoul(s, NULL, 0);
+
+ if (vdso32_enabled > 1) {
+ pr_warn("vdso32 values other than 0 and 1 are no longer allowed; vdso disabled\n");
+ vdso32_enabled = 0;
+ }
+
+ return 1;
+}
+
+/*
+ * For consistency, the argument vdso32=[012] affects the 32-bit vDSO
+ * behavior on both 64-bit and 32-bit kernels.
+ * On 32-bit kernels, vdso=[012] means the same thing.
+ */
+__setup("vdso32=", vdso32_setup);
+
+#ifdef CONFIG_X86_32
+__setup_param("vdso=", vdso_setup, vdso32_setup, 0);
+#endif
+
+#ifdef CONFIG_X86_64
+
+#ifdef CONFIG_SYSCTL
+/* Register vsyscall32 into the ABI table */
+#include <linux/sysctl.h>
+
+static struct ctl_table abi_table2[] = {
+ {
+ .procname = "vsyscall32",
+ .data = &vdso32_enabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {}
+};
+
+static __init int ia32_binfmt_init(void)
+{
+ register_sysctl("abi", abi_table2);
+ return 0;
+}
+__initcall(ia32_binfmt_init);
+#endif /* CONFIG_SYSCTL */
+
+#endif /* CONFIG_X86_64 */
diff --git a/arch/x86/entry/vdso/vdso32/.gitignore b/arch/x86/entry/vdso/vdso32/.gitignore
new file mode 100644
index 0000000000..5167384843
--- /dev/null
+++ b/arch/x86/entry/vdso/vdso32/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+vdso32.lds
diff --git a/arch/x86/entry/vdso/vdso32/fake_32bit_build.h b/arch/x86/entry/vdso/vdso32/fake_32bit_build.h
new file mode 100644
index 0000000000..db1b15f686
--- /dev/null
+++ b/arch/x86/entry/vdso/vdso32/fake_32bit_build.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifdef CONFIG_X86_64
+
+/*
+ * in case of a 32 bit VDSO for a 64 bit kernel fake a 32 bit kernel
+ * configuration
+ */
+#undef CONFIG_64BIT
+#undef CONFIG_X86_64
+#undef CONFIG_COMPAT
+#undef CONFIG_PGTABLE_LEVELS
+#undef CONFIG_ILLEGAL_POINTER_VALUE
+#undef CONFIG_SPARSEMEM_VMEMMAP
+#undef CONFIG_NR_CPUS
+#undef CONFIG_PARAVIRT_XXL
+
+#define CONFIG_X86_32 1
+#define CONFIG_PGTABLE_LEVELS 2
+#define CONFIG_PAGE_OFFSET 0
+#define CONFIG_ILLEGAL_POINTER_VALUE 0
+#define CONFIG_NR_CPUS 1
+
+#define BUILD_VDSO32_64
+
+#endif
diff --git a/arch/x86/entry/vdso/vdso32/note.S b/arch/x86/entry/vdso/vdso32/note.S
new file mode 100644
index 0000000000..2cbd39939d
--- /dev/null
+++ b/arch/x86/entry/vdso/vdso32/note.S
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text.
+ * Here we can supply some information useful to userland.
+ */
+
+#include <linux/build-salt.h>
+#include <linux/version.h>
+#include <linux/elfnote.h>
+
+/* Ideally this would use UTS_NAME, but using a quoted string here
+ doesn't work. Remember to change this when changing the
+ kernel's name. */
+ELFNOTE_START(Linux, 0, "a")
+ .long LINUX_VERSION_CODE
+ELFNOTE_END
+
+BUILD_SALT
diff --git a/arch/x86/entry/vdso/vdso32/sigreturn.S b/arch/x86/entry/vdso/vdso32/sigreturn.S
new file mode 100644
index 0000000000..1bd068f72d
--- /dev/null
+++ b/arch/x86/entry/vdso/vdso32/sigreturn.S
@@ -0,0 +1,140 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/linkage.h>
+#include <asm/unistd_32.h>
+#include <asm/asm-offsets.h>
+
+#ifndef SYSCALL_ENTER_KERNEL
+#define SYSCALL_ENTER_KERNEL int $0x80
+#endif
+
+ .text
+ .globl __kernel_sigreturn
+ .type __kernel_sigreturn,@function
+ nop /* this guy is needed for .LSTARTFDEDLSI1 below (watch for HACK) */
+ ALIGN
+__kernel_sigreturn:
+.LSTART_sigreturn:
+ popl %eax /* XXX does this mean it needs unwind info? */
+ movl $__NR_sigreturn, %eax
+ SYSCALL_ENTER_KERNEL
+.LEND_sigreturn:
+SYM_INNER_LABEL(vdso32_sigreturn_landing_pad, SYM_L_GLOBAL)
+ nop
+ .size __kernel_sigreturn,.-.LSTART_sigreturn
+
+ .globl __kernel_rt_sigreturn
+ .type __kernel_rt_sigreturn,@function
+ ALIGN
+__kernel_rt_sigreturn:
+.LSTART_rt_sigreturn:
+ movl $__NR_rt_sigreturn, %eax
+ SYSCALL_ENTER_KERNEL
+.LEND_rt_sigreturn:
+SYM_INNER_LABEL(vdso32_rt_sigreturn_landing_pad, SYM_L_GLOBAL)
+ nop
+ .size __kernel_rt_sigreturn,.-.LSTART_rt_sigreturn
+ .previous
+
+ .section .eh_frame,"a",@progbits
+.LSTARTFRAMEDLSI1:
+ .long .LENDCIEDLSI1-.LSTARTCIEDLSI1
+.LSTARTCIEDLSI1:
+ .long 0 /* CIE ID */
+ .byte 1 /* Version number */
+ .string "zRS" /* NUL-terminated augmentation string */
+ .uleb128 1 /* Code alignment factor */
+ .sleb128 -4 /* Data alignment factor */
+ .byte 8 /* Return address register column */
+ .uleb128 1 /* Augmentation value length */
+ .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
+ .byte 0 /* DW_CFA_nop */
+ .align 4
+.LENDCIEDLSI1:
+ .long .LENDFDEDLSI1-.LSTARTFDEDLSI1 /* Length FDE */
+.LSTARTFDEDLSI1:
+ .long .LSTARTFDEDLSI1-.LSTARTFRAMEDLSI1 /* CIE pointer */
+ /* HACK: The dwarf2 unwind routines will subtract 1 from the
+ return address to get an address in the middle of the
+ presumed call instruction. Since we didn't get here via
+ a call, we need to include the nop before the real start
+ to make up for it. */
+ .long .LSTART_sigreturn-1-. /* PC-relative start address */
+ .long .LEND_sigreturn-.LSTART_sigreturn+1
+ .uleb128 0 /* Augmentation */
+ /* What follows are the instructions for the table generation.
+ We record the locations of each register saved. This is
+ complicated by the fact that the "CFA" is always assumed to
+ be the value of the stack pointer in the caller. This means
+ that we must define the CFA of this body of code to be the
+ saved value of the stack pointer in the sigcontext. Which
+ also means that there is no fixed relation to the other
+ saved registers, which means that we must use DW_CFA_expression
+ to compute their addresses. It also means that when we
+ adjust the stack with the popl, we have to do it all over again. */
+
+#define do_cfa_expr(offset) \
+ .byte 0x0f; /* DW_CFA_def_cfa_expression */ \
+ .uleb128 1f-0f; /* length */ \
+0: .byte 0x74; /* DW_OP_breg4 */ \
+ .sleb128 offset; /* offset */ \
+ .byte 0x06; /* DW_OP_deref */ \
+1:
+
+#define do_expr(regno, offset) \
+ .byte 0x10; /* DW_CFA_expression */ \
+ .uleb128 regno; /* regno */ \
+ .uleb128 1f-0f; /* length */ \
+0: .byte 0x74; /* DW_OP_breg4 */ \
+ .sleb128 offset; /* offset */ \
+1:
+
+ do_cfa_expr(IA32_SIGCONTEXT_sp+4)
+ do_expr(0, IA32_SIGCONTEXT_ax+4)
+ do_expr(1, IA32_SIGCONTEXT_cx+4)
+ do_expr(2, IA32_SIGCONTEXT_dx+4)
+ do_expr(3, IA32_SIGCONTEXT_bx+4)
+ do_expr(5, IA32_SIGCONTEXT_bp+4)
+ do_expr(6, IA32_SIGCONTEXT_si+4)
+ do_expr(7, IA32_SIGCONTEXT_di+4)
+ do_expr(8, IA32_SIGCONTEXT_ip+4)
+
+ .byte 0x42 /* DW_CFA_advance_loc 2 -- nop; popl eax. */
+
+ do_cfa_expr(IA32_SIGCONTEXT_sp)
+ do_expr(0, IA32_SIGCONTEXT_ax)
+ do_expr(1, IA32_SIGCONTEXT_cx)
+ do_expr(2, IA32_SIGCONTEXT_dx)
+ do_expr(3, IA32_SIGCONTEXT_bx)
+ do_expr(5, IA32_SIGCONTEXT_bp)
+ do_expr(6, IA32_SIGCONTEXT_si)
+ do_expr(7, IA32_SIGCONTEXT_di)
+ do_expr(8, IA32_SIGCONTEXT_ip)
+
+ .align 4
+.LENDFDEDLSI1:
+
+ .long .LENDFDEDLSI2-.LSTARTFDEDLSI2 /* Length FDE */
+.LSTARTFDEDLSI2:
+ .long .LSTARTFDEDLSI2-.LSTARTFRAMEDLSI1 /* CIE pointer */
+ /* HACK: See above wrt unwind library assumptions. */
+ .long .LSTART_rt_sigreturn-1-. /* PC-relative start address */
+ .long .LEND_rt_sigreturn-.LSTART_rt_sigreturn+1
+ .uleb128 0 /* Augmentation */
+ /* What follows are the instructions for the table generation.
+ We record the locations of each register saved. This is
+ slightly less complicated than the above, since we don't
+ modify the stack pointer in the process. */
+
+ do_cfa_expr(IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_sp)
+ do_expr(0, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ax)
+ do_expr(1, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_cx)
+ do_expr(2, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_dx)
+ do_expr(3, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_bx)
+ do_expr(5, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_bp)
+ do_expr(6, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_si)
+ do_expr(7, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_di)
+ do_expr(8, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ip)
+
+ .align 4
+.LENDFDEDLSI2:
+ .previous
diff --git a/arch/x86/entry/vdso/vdso32/system_call.S b/arch/x86/entry/vdso/vdso32/system_call.S
new file mode 100644
index 0000000000..d33c6513fd
--- /dev/null
+++ b/arch/x86/entry/vdso/vdso32/system_call.S
@@ -0,0 +1,85 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * AT_SYSINFO entry point
+*/
+
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+#include <asm/cpufeatures.h>
+#include <asm/alternative.h>
+
+ .text
+ .globl __kernel_vsyscall
+ .type __kernel_vsyscall,@function
+ ALIGN
+__kernel_vsyscall:
+ CFI_STARTPROC
+ /*
+ * Reshuffle regs so that all of any of the entry instructions
+ * will preserve enough state.
+ *
+ * A really nice entry sequence would be:
+ * pushl %edx
+ * pushl %ecx
+ * movl %esp, %ecx
+ *
+ * Unfortunately, naughty Android versions between July and December
+ * 2015 actually hardcode the traditional Linux SYSENTER entry
+ * sequence. That is severely broken for a number of reasons (ask
+ * anyone with an AMD CPU, for example). Nonetheless, we try to keep
+ * it working approximately as well as it ever worked.
+ *
+ * This link may elucidate some of the history:
+ * https://android-review.googlesource.com/#/q/Iac3295376d61ef83e713ac9b528f3b50aa780cd7
+ * personally, I find it hard to understand what's going on there.
+ *
+ * Note to future user developers: DO NOT USE SYSENTER IN YOUR CODE.
+ * Execute an indirect call to the address in the AT_SYSINFO auxv
+ * entry. That is the ONLY correct way to make a fast 32-bit system
+ * call on Linux. (Open-coding int $0x80 is also fine, but it's
+ * slow.)
+ */
+ pushl %ecx
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET ecx, 0
+ pushl %edx
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET edx, 0
+ pushl %ebp
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET ebp, 0
+
+ #define SYSENTER_SEQUENCE "movl %esp, %ebp; sysenter"
+ #define SYSCALL_SEQUENCE "movl %ecx, %ebp; syscall"
+
+#ifdef CONFIG_X86_64
+ /* If SYSENTER (Intel) or SYSCALL32 (AMD) is available, use it. */
+ ALTERNATIVE_2 "", SYSENTER_SEQUENCE, X86_FEATURE_SYSENTER32, \
+ SYSCALL_SEQUENCE, X86_FEATURE_SYSCALL32
+#else
+ ALTERNATIVE "", SYSENTER_SEQUENCE, X86_FEATURE_SEP
+#endif
+
+ /* Enter using int $0x80 */
+ int $0x80
+SYM_INNER_LABEL(int80_landing_pad, SYM_L_GLOBAL)
+
+ /*
+ * Restore EDX and ECX in case they were clobbered. EBP is not
+ * clobbered (the kernel restores it), but it's cleaner and
+ * probably faster to pop it than to adjust ESP using addl.
+ */
+ popl %ebp
+ CFI_RESTORE ebp
+ CFI_ADJUST_CFA_OFFSET -4
+ popl %edx
+ CFI_RESTORE edx
+ CFI_ADJUST_CFA_OFFSET -4
+ popl %ecx
+ CFI_RESTORE ecx
+ CFI_ADJUST_CFA_OFFSET -4
+ RET
+ CFI_ENDPROC
+
+ .size __kernel_vsyscall,.-__kernel_vsyscall
+ .previous
diff --git a/arch/x86/entry/vdso/vdso32/vclock_gettime.c b/arch/x86/entry/vdso/vdso32/vclock_gettime.c
new file mode 100644
index 0000000000..86981decfe
--- /dev/null
+++ b/arch/x86/entry/vdso/vdso32/vclock_gettime.c
@@ -0,0 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
+#define BUILD_VDSO32
+#include "fake_32bit_build.h"
+#include "../vclock_gettime.c"
diff --git a/arch/x86/entry/vdso/vdso32/vdso32.lds.S b/arch/x86/entry/vdso/vdso32/vdso32.lds.S
new file mode 100644
index 0000000000..8a3be07006
--- /dev/null
+++ b/arch/x86/entry/vdso/vdso32/vdso32.lds.S
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Linker script for 32-bit vDSO.
+ * We #include the file to define the layout details.
+ *
+ * This file defines the version script giving the user-exported symbols in
+ * the DSO.
+ */
+
+#include <asm/page.h>
+
+#define BUILD_VDSO32
+
+#include "../vdso-layout.lds.S"
+
+/* The ELF entry point can be used to set the AT_SYSINFO value. */
+ENTRY(__kernel_vsyscall);
+
+/*
+ * This controls what userland symbols we export from the vDSO.
+ */
+VERSION
+{
+ LINUX_2.6 {
+ global:
+ __vdso_clock_gettime;
+ __vdso_gettimeofday;
+ __vdso_time;
+ __vdso_clock_getres;
+ __vdso_clock_gettime64;
+ __vdso_getcpu;
+ };
+
+ LINUX_2.5 {
+ global:
+ __kernel_vsyscall;
+ __kernel_sigreturn;
+ __kernel_rt_sigreturn;
+ local: *;
+ };
+}
diff --git a/arch/x86/entry/vdso/vdso32/vgetcpu.c b/arch/x86/entry/vdso/vdso32/vgetcpu.c
new file mode 100644
index 0000000000..3a9791f5e9
--- /dev/null
+++ b/arch/x86/entry/vdso/vdso32/vgetcpu.c
@@ -0,0 +1,3 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "fake_32bit_build.h"
+#include "../vgetcpu.c"
diff --git a/arch/x86/entry/vdso/vdsox32.lds.S b/arch/x86/entry/vdso/vdsox32.lds.S
new file mode 100644
index 0000000000..16a8050a4f
--- /dev/null
+++ b/arch/x86/entry/vdso/vdsox32.lds.S
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Linker script for x32 vDSO.
+ * We #include the file to define the layout details.
+ *
+ * This file defines the version script giving the user-exported symbols in
+ * the DSO.
+ */
+
+#define BUILD_VDSOX32
+
+#include "vdso-layout.lds.S"
+
+/*
+ * This controls what userland symbols we export from the vDSO.
+ */
+VERSION {
+ LINUX_2.6 {
+ global:
+ __vdso_clock_gettime;
+ __vdso_gettimeofday;
+ __vdso_getcpu;
+ __vdso_time;
+ __vdso_clock_getres;
+ local: *;
+ };
+}
diff --git a/arch/x86/entry/vdso/vgetcpu.c b/arch/x86/entry/vdso/vgetcpu.c
new file mode 100644
index 0000000000..e4640306b2
--- /dev/null
+++ b/arch/x86/entry/vdso/vgetcpu.c
@@ -0,0 +1,22 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2006 Andi Kleen, SUSE Labs.
+ *
+ * Fast user context implementation of getcpu()
+ */
+
+#include <linux/kernel.h>
+#include <linux/getcpu.h>
+#include <asm/segment.h>
+#include <vdso/processor.h>
+
+notrace long
+__vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused)
+{
+ vdso_read_cpunode(cpu, node);
+
+ return 0;
+}
+
+long getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
+ __attribute__((weak, alias("__vdso_getcpu")));
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
new file mode 100644
index 0000000000..7645730dc2
--- /dev/null
+++ b/arch/x86/entry/vdso/vma.c
@@ -0,0 +1,422 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2007 Andi Kleen, SUSE Labs.
+ *
+ * This contains most of the x86 vDSO kernel-side code.
+ */
+#include <linux/mm.h>
+#include <linux/err.h>
+#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/random.h>
+#include <linux/elf.h>
+#include <linux/cpu.h>
+#include <linux/ptrace.h>
+#include <linux/time_namespace.h>
+
+#include <asm/pvclock.h>
+#include <asm/vgtod.h>
+#include <asm/proto.h>
+#include <asm/vdso.h>
+#include <asm/vvar.h>
+#include <asm/tlb.h>
+#include <asm/page.h>
+#include <asm/desc.h>
+#include <asm/cpufeature.h>
+#include <clocksource/hyperv_timer.h>
+
+#undef _ASM_X86_VVAR_H
+#define EMIT_VVAR(name, offset) \
+ const size_t name ## _offset = offset;
+#include <asm/vvar.h>
+
+struct vdso_data *arch_get_vdso_data(void *vvar_page)
+{
+ return (struct vdso_data *)(vvar_page + _vdso_data_offset);
+}
+#undef EMIT_VVAR
+
+unsigned int vclocks_used __read_mostly;
+
+#if defined(CONFIG_X86_64)
+unsigned int __read_mostly vdso64_enabled = 1;
+#endif
+
+int __init init_vdso_image(const struct vdso_image *image)
+{
+ BUILD_BUG_ON(VDSO_CLOCKMODE_MAX >= 32);
+ BUG_ON(image->size % PAGE_SIZE != 0);
+
+ apply_alternatives((struct alt_instr *)(image->data + image->alt),
+ (struct alt_instr *)(image->data + image->alt +
+ image->alt_len));
+
+ return 0;
+}
+
+static const struct vm_special_mapping vvar_mapping;
+struct linux_binprm;
+
+static vm_fault_t vdso_fault(const struct vm_special_mapping *sm,
+ struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ const struct vdso_image *image = vma->vm_mm->context.vdso_image;
+
+ if (!image || (vmf->pgoff << PAGE_SHIFT) >= image->size)
+ return VM_FAULT_SIGBUS;
+
+ vmf->page = virt_to_page(image->data + (vmf->pgoff << PAGE_SHIFT));
+ get_page(vmf->page);
+ return 0;
+}
+
+static void vdso_fix_landing(const struct vdso_image *image,
+ struct vm_area_struct *new_vma)
+{
+#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
+ if (in_ia32_syscall() && image == &vdso_image_32) {
+ struct pt_regs *regs = current_pt_regs();
+ unsigned long vdso_land = image->sym_int80_landing_pad;
+ unsigned long old_land_addr = vdso_land +
+ (unsigned long)current->mm->context.vdso;
+
+ /* Fixing userspace landing - look at do_fast_syscall_32 */
+ if (regs->ip == old_land_addr)
+ regs->ip = new_vma->vm_start + vdso_land;
+ }
+#endif
+}
+
+static int vdso_mremap(const struct vm_special_mapping *sm,
+ struct vm_area_struct *new_vma)
+{
+ const struct vdso_image *image = current->mm->context.vdso_image;
+
+ vdso_fix_landing(image, new_vma);
+ current->mm->context.vdso = (void __user *)new_vma->vm_start;
+
+ return 0;
+}
+
+#ifdef CONFIG_TIME_NS
+/*
+ * The vvar page layout depends on whether a task belongs to the root or
+ * non-root time namespace. Whenever a task changes its namespace, the VVAR
+ * page tables are cleared and then they will re-faulted with a
+ * corresponding layout.
+ * See also the comment near timens_setup_vdso_data() for details.
+ */
+int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
+{
+ struct mm_struct *mm = task->mm;
+ struct vm_area_struct *vma;
+ VMA_ITERATOR(vmi, mm, 0);
+
+ mmap_read_lock(mm);
+ for_each_vma(vmi, vma) {
+ if (vma_is_special_mapping(vma, &vvar_mapping))
+ zap_vma_pages(vma);
+ }
+ mmap_read_unlock(mm);
+
+ return 0;
+}
+#endif
+
+static vm_fault_t vvar_fault(const struct vm_special_mapping *sm,
+ struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ const struct vdso_image *image = vma->vm_mm->context.vdso_image;
+ unsigned long pfn;
+ long sym_offset;
+
+ if (!image)
+ return VM_FAULT_SIGBUS;
+
+ sym_offset = (long)(vmf->pgoff << PAGE_SHIFT) +
+ image->sym_vvar_start;
+
+ /*
+ * Sanity check: a symbol offset of zero means that the page
+ * does not exist for this vdso image, not that the page is at
+ * offset zero relative to the text mapping. This should be
+ * impossible here, because sym_offset should only be zero for
+ * the page past the end of the vvar mapping.
+ */
+ if (sym_offset == 0)
+ return VM_FAULT_SIGBUS;
+
+ if (sym_offset == image->sym_vvar_page) {
+ struct page *timens_page = find_timens_vvar_page(vma);
+
+ pfn = __pa_symbol(&__vvar_page) >> PAGE_SHIFT;
+
+ /*
+ * If a task belongs to a time namespace then a namespace
+ * specific VVAR is mapped with the sym_vvar_page offset and
+ * the real VVAR page is mapped with the sym_timens_page
+ * offset.
+ * See also the comment near timens_setup_vdso_data().
+ */
+ if (timens_page) {
+ unsigned long addr;
+ vm_fault_t err;
+
+ /*
+ * Optimization: inside time namespace pre-fault
+ * VVAR page too. As on timens page there are only
+ * offsets for clocks on VVAR, it'll be faulted
+ * shortly by VDSO code.
+ */
+ addr = vmf->address + (image->sym_timens_page - sym_offset);
+ err = vmf_insert_pfn(vma, addr, pfn);
+ if (unlikely(err & VM_FAULT_ERROR))
+ return err;
+
+ pfn = page_to_pfn(timens_page);
+ }
+
+ return vmf_insert_pfn(vma, vmf->address, pfn);
+ } else if (sym_offset == image->sym_pvclock_page) {
+ struct pvclock_vsyscall_time_info *pvti =
+ pvclock_get_pvti_cpu0_va();
+ if (pvti && vclock_was_used(VDSO_CLOCKMODE_PVCLOCK)) {
+ return vmf_insert_pfn_prot(vma, vmf->address,
+ __pa(pvti) >> PAGE_SHIFT,
+ pgprot_decrypted(vma->vm_page_prot));
+ }
+ } else if (sym_offset == image->sym_hvclock_page) {
+ pfn = hv_get_tsc_pfn();
+
+ if (pfn && vclock_was_used(VDSO_CLOCKMODE_HVCLOCK))
+ return vmf_insert_pfn(vma, vmf->address, pfn);
+ } else if (sym_offset == image->sym_timens_page) {
+ struct page *timens_page = find_timens_vvar_page(vma);
+
+ if (!timens_page)
+ return VM_FAULT_SIGBUS;
+
+ pfn = __pa_symbol(&__vvar_page) >> PAGE_SHIFT;
+ return vmf_insert_pfn(vma, vmf->address, pfn);
+ }
+
+ return VM_FAULT_SIGBUS;
+}
+
+static const struct vm_special_mapping vdso_mapping = {
+ .name = "[vdso]",
+ .fault = vdso_fault,
+ .mremap = vdso_mremap,
+};
+static const struct vm_special_mapping vvar_mapping = {
+ .name = "[vvar]",
+ .fault = vvar_fault,
+};
+
+/*
+ * Add vdso and vvar mappings to current process.
+ * @image - blob to map
+ * @addr - request a specific address (zero to map at free addr)
+ */
+static int map_vdso(const struct vdso_image *image, unsigned long addr)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ unsigned long text_start;
+ int ret = 0;
+
+ if (mmap_write_lock_killable(mm))
+ return -EINTR;
+
+ addr = get_unmapped_area(NULL, addr,
+ image->size - image->sym_vvar_start, 0, 0);
+ if (IS_ERR_VALUE(addr)) {
+ ret = addr;
+ goto up_fail;
+ }
+
+ text_start = addr - image->sym_vvar_start;
+
+ /*
+ * MAYWRITE to allow gdb to COW and set breakpoints
+ */
+ vma = _install_special_mapping(mm,
+ text_start,
+ image->size,
+ VM_READ|VM_EXEC|
+ VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
+ &vdso_mapping);
+
+ if (IS_ERR(vma)) {
+ ret = PTR_ERR(vma);
+ goto up_fail;
+ }
+
+ vma = _install_special_mapping(mm,
+ addr,
+ -image->sym_vvar_start,
+ VM_READ|VM_MAYREAD|VM_IO|VM_DONTDUMP|
+ VM_PFNMAP,
+ &vvar_mapping);
+
+ if (IS_ERR(vma)) {
+ ret = PTR_ERR(vma);
+ do_munmap(mm, text_start, image->size, NULL);
+ } else {
+ current->mm->context.vdso = (void __user *)text_start;
+ current->mm->context.vdso_image = image;
+ }
+
+up_fail:
+ mmap_write_unlock(mm);
+ return ret;
+}
+
+#ifdef CONFIG_X86_64
+/*
+ * Put the vdso above the (randomized) stack with another randomized
+ * offset. This way there is no hole in the middle of address space.
+ * To save memory make sure it is still in the same PTE as the stack
+ * top. This doesn't give that many random bits.
+ *
+ * Note that this algorithm is imperfect: the distribution of the vdso
+ * start address within a PMD is biased toward the end.
+ *
+ * Only used for the 64-bit and x32 vdsos.
+ */
+static unsigned long vdso_addr(unsigned long start, unsigned len)
+{
+ unsigned long addr, end;
+ unsigned offset;
+
+ /*
+ * Round up the start address. It can start out unaligned as a result
+ * of stack start randomization.
+ */
+ start = PAGE_ALIGN(start);
+
+ /* Round the lowest possible end address up to a PMD boundary. */
+ end = (start + len + PMD_SIZE - 1) & PMD_MASK;
+ if (end >= DEFAULT_MAP_WINDOW)
+ end = DEFAULT_MAP_WINDOW;
+ end -= len;
+
+ if (end > start) {
+ offset = get_random_u32_below(((end - start) >> PAGE_SHIFT) + 1);
+ addr = start + (offset << PAGE_SHIFT);
+ } else {
+ addr = start;
+ }
+
+ /*
+ * Forcibly align the final address in case we have a hardware
+ * issue that requires alignment for performance reasons.
+ */
+ addr = align_vdso_addr(addr);
+
+ return addr;
+}
+
+static int map_vdso_randomized(const struct vdso_image *image)
+{
+ unsigned long addr = vdso_addr(current->mm->start_stack, image->size-image->sym_vvar_start);
+
+ return map_vdso(image, addr);
+}
+#endif
+
+int map_vdso_once(const struct vdso_image *image, unsigned long addr)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ VMA_ITERATOR(vmi, mm, 0);
+
+ mmap_write_lock(mm);
+ /*
+ * Check if we have already mapped vdso blob - fail to prevent
+ * abusing from userspace install_special_mapping, which may
+ * not do accounting and rlimit right.
+ * We could search vma near context.vdso, but it's a slowpath,
+ * so let's explicitly check all VMAs to be completely sure.
+ */
+ for_each_vma(vmi, vma) {
+ if (vma_is_special_mapping(vma, &vdso_mapping) ||
+ vma_is_special_mapping(vma, &vvar_mapping)) {
+ mmap_write_unlock(mm);
+ return -EEXIST;
+ }
+ }
+ mmap_write_unlock(mm);
+
+ return map_vdso(image, addr);
+}
+
+#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
+static int load_vdso32(void)
+{
+ if (vdso32_enabled != 1) /* Other values all mean "disabled" */
+ return 0;
+
+ return map_vdso(&vdso_image_32, 0);
+}
+#endif
+
+#ifdef CONFIG_X86_64
+int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
+{
+ if (!vdso64_enabled)
+ return 0;
+
+ return map_vdso_randomized(&vdso_image_64);
+}
+
+#ifdef CONFIG_COMPAT
+int compat_arch_setup_additional_pages(struct linux_binprm *bprm,
+ int uses_interp, bool x32)
+{
+#ifdef CONFIG_X86_X32_ABI
+ if (x32) {
+ if (!vdso64_enabled)
+ return 0;
+ return map_vdso_randomized(&vdso_image_x32);
+ }
+#endif
+#ifdef CONFIG_IA32_EMULATION
+ return load_vdso32();
+#else
+ return 0;
+#endif
+}
+#endif
+#else
+int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
+{
+ return load_vdso32();
+}
+#endif
+
+bool arch_syscall_is_vdso_sigreturn(struct pt_regs *regs)
+{
+#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
+ const struct vdso_image *image = current->mm->context.vdso_image;
+ unsigned long vdso = (unsigned long) current->mm->context.vdso;
+
+ if (in_ia32_syscall() && image == &vdso_image_32) {
+ if (regs->ip == vdso + image->sym_vdso32_sigreturn_landing_pad ||
+ regs->ip == vdso + image->sym_vdso32_rt_sigreturn_landing_pad)
+ return true;
+ }
+#endif
+ return false;
+}
+
+#ifdef CONFIG_X86_64
+static __init int vdso_setup(char *s)
+{
+ vdso64_enabled = simple_strtoul(s, NULL, 0);
+ return 1;
+}
+__setup("vdso=", vdso_setup);
+#endif /* CONFIG_X86_64 */
diff --git a/arch/x86/entry/vdso/vsgx.S b/arch/x86/entry/vdso/vsgx.S
new file mode 100644
index 0000000000..d77d278ee9
--- /dev/null
+++ b/arch/x86/entry/vdso/vsgx.S
@@ -0,0 +1,151 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <linux/linkage.h>
+#include <asm/export.h>
+#include <asm/errno.h>
+#include <asm/enclu.h>
+
+#include "extable.h"
+
+/* Relative to %rbp. */
+#define SGX_ENCLAVE_OFFSET_OF_RUN 16
+
+/* The offsets relative to struct sgx_enclave_run. */
+#define SGX_ENCLAVE_RUN_TCS 0
+#define SGX_ENCLAVE_RUN_LEAF 8
+#define SGX_ENCLAVE_RUN_EXCEPTION_VECTOR 12
+#define SGX_ENCLAVE_RUN_EXCEPTION_ERROR_CODE 14
+#define SGX_ENCLAVE_RUN_EXCEPTION_ADDR 16
+#define SGX_ENCLAVE_RUN_USER_HANDLER 24
+#define SGX_ENCLAVE_RUN_USER_DATA 32 /* not used */
+#define SGX_ENCLAVE_RUN_RESERVED_START 40
+#define SGX_ENCLAVE_RUN_RESERVED_END 256
+
+.code64
+.section .text, "ax"
+
+SYM_FUNC_START(__vdso_sgx_enter_enclave)
+ /* Prolog */
+ .cfi_startproc
+ push %rbp
+ .cfi_adjust_cfa_offset 8
+ .cfi_rel_offset %rbp, 0
+ mov %rsp, %rbp
+ .cfi_def_cfa_register %rbp
+ push %rbx
+ .cfi_rel_offset %rbx, -8
+
+ mov %ecx, %eax
+.Lenter_enclave:
+ /* EENTER <= function <= ERESUME */
+ cmp $EENTER, %eax
+ jb .Linvalid_input
+ cmp $ERESUME, %eax
+ ja .Linvalid_input
+
+ mov SGX_ENCLAVE_OFFSET_OF_RUN(%rbp), %rcx
+
+ /* Validate that the reserved area contains only zeros. */
+ mov $SGX_ENCLAVE_RUN_RESERVED_START, %rbx
+1:
+ cmpq $0, (%rcx, %rbx)
+ jne .Linvalid_input
+ add $8, %rbx
+ cmpq $SGX_ENCLAVE_RUN_RESERVED_END, %rbx
+ jne 1b
+
+ /* Load TCS and AEP */
+ mov SGX_ENCLAVE_RUN_TCS(%rcx), %rbx
+ lea .Lasync_exit_pointer(%rip), %rcx
+
+ /* Single ENCLU serving as both EENTER and AEP (ERESUME) */
+.Lasync_exit_pointer:
+.Lenclu_eenter_eresume:
+ enclu
+
+ /* EEXIT jumps here unless the enclave is doing something fancy. */
+ mov SGX_ENCLAVE_OFFSET_OF_RUN(%rbp), %rbx
+
+ /* Set exit_reason. */
+ movl $EEXIT, SGX_ENCLAVE_RUN_LEAF(%rbx)
+
+ /* Invoke userspace's exit handler if one was provided. */
+.Lhandle_exit:
+ cmpq $0, SGX_ENCLAVE_RUN_USER_HANDLER(%rbx)
+ jne .Linvoke_userspace_handler
+
+ /* Success, in the sense that ENCLU was attempted. */
+ xor %eax, %eax
+
+.Lout:
+ pop %rbx
+ leave
+ .cfi_def_cfa %rsp, 8
+ RET
+
+ /* The out-of-line code runs with the pre-leave stack frame. */
+ .cfi_def_cfa %rbp, 16
+
+.Linvalid_input:
+ mov $(-EINVAL), %eax
+ jmp .Lout
+
+.Lhandle_exception:
+ mov SGX_ENCLAVE_OFFSET_OF_RUN(%rbp), %rbx
+
+ /* Set the exception info. */
+ mov %eax, (SGX_ENCLAVE_RUN_LEAF)(%rbx)
+ mov %di, (SGX_ENCLAVE_RUN_EXCEPTION_VECTOR)(%rbx)
+ mov %si, (SGX_ENCLAVE_RUN_EXCEPTION_ERROR_CODE)(%rbx)
+ mov %rdx, (SGX_ENCLAVE_RUN_EXCEPTION_ADDR)(%rbx)
+ jmp .Lhandle_exit
+
+.Linvoke_userspace_handler:
+ /* Pass the untrusted RSP (at exit) to the callback via %rcx. */
+ mov %rsp, %rcx
+
+ /* Save struct sgx_enclave_exception %rbx is about to be clobbered. */
+ mov %rbx, %rax
+
+ /* Save the untrusted RSP offset in %rbx (non-volatile register). */
+ mov %rsp, %rbx
+ and $0xf, %rbx
+
+ /*
+ * Align stack per x86_64 ABI. Note, %rsp needs to be 16-byte aligned
+ * _after_ pushing the parameters on the stack, hence the bonus push.
+ */
+ and $-0x10, %rsp
+ push %rax
+
+ /* Push struct sgx_enclave_exception as a param to the callback. */
+ push %rax
+
+ /* Clear RFLAGS.DF per x86_64 ABI */
+ cld
+
+ /*
+ * Load the callback pointer to %rax and lfence for LVI (load value
+ * injection) protection before making the call.
+ */
+ mov SGX_ENCLAVE_RUN_USER_HANDLER(%rax), %rax
+ lfence
+ call *%rax
+
+ /* Undo the post-exit %rsp adjustment. */
+ lea 0x10(%rsp, %rbx), %rsp
+
+ /*
+ * If the return from callback is zero or negative, return immediately,
+ * else re-execute ENCLU with the positive return value interpreted as
+ * the requested ENCLU function.
+ */
+ cmp $0, %eax
+ jle .Lout
+ jmp .Lenter_enclave
+
+ .cfi_endproc
+
+_ASM_VDSO_EXTABLE_HANDLE(.Lenclu_eenter_eresume, .Lhandle_exception)
+
+SYM_FUNC_END(__vdso_sgx_enter_enclave)